updates
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
# Natural Language Toolkit: Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor additions)
|
||||
# Contributors: matthewmc, clouds56
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
r"""
|
||||
NLTK Tokenizer Package
|
||||
|
||||
Tokenizers divide strings into lists of substrings. For example,
|
||||
tokenizers can be used to find the words and punctuation in a string:
|
||||
|
||||
>>> from nltk.tokenize import word_tokenize
|
||||
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
|
||||
... two of them.\n\nThanks.'''
|
||||
>>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
|
||||
This particular tokenizer requires the Punkt sentence tokenization
|
||||
models to be installed. NLTK also provides a simpler,
|
||||
regular-expression based tokenizer, which splits text on whitespace
|
||||
and punctuation:
|
||||
|
||||
>>> from nltk.tokenize import wordpunct_tokenize
|
||||
>>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
|
||||
We can also operate at the level of sentences, using the sentence
|
||||
tokenizer directly as follows:
|
||||
|
||||
>>> from nltk.tokenize import sent_tokenize, word_tokenize
|
||||
>>> sent_tokenize(s)
|
||||
['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
|
||||
>>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
|
||||
[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
|
||||
['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
|
||||
|
||||
Caution: when tokenizing a Unicode string, make sure you are not
|
||||
using an encoded version of the string (it may be necessary to
|
||||
decode it first, e.g. with ``s.decode("utf8")``.
|
||||
|
||||
NLTK tokenizers can produce token-spans, represented as tuples of integers
|
||||
having the same semantics as string slices, to support efficient comparison
|
||||
of tokenizers. (These methods are implemented as generators.)
|
||||
|
||||
>>> from nltk.tokenize import WhitespaceTokenizer
|
||||
>>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
|
||||
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
|
||||
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
|
||||
|
||||
There are numerous ways to tokenize text. If you need more control over
|
||||
tokenization, see the other methods provided in this package.
|
||||
|
||||
For further information, please see Chapter 3 of the NLTK book.
|
||||
"""
|
||||
|
||||
import functools
|
||||
import re
|
||||
|
||||
from nltk.data import load
|
||||
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
|
||||
from nltk.tokenize.destructive import NLTKWordTokenizer
|
||||
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
|
||||
from nltk.tokenize.mwe import MWETokenizer
|
||||
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTokenizer
|
||||
from nltk.tokenize.regexp import (
|
||||
BlanklineTokenizer,
|
||||
RegexpTokenizer,
|
||||
WhitespaceTokenizer,
|
||||
WordPunctTokenizer,
|
||||
blankline_tokenize,
|
||||
regexp_tokenize,
|
||||
wordpunct_tokenize,
|
||||
)
|
||||
from nltk.tokenize.repp import ReppTokenizer
|
||||
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
|
||||
from nltk.tokenize.simple import (
|
||||
LineTokenizer,
|
||||
SpaceTokenizer,
|
||||
TabTokenizer,
|
||||
line_tokenize,
|
||||
)
|
||||
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
|
||||
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
|
||||
from nltk.tokenize.texttiling import TextTilingTokenizer
|
||||
from nltk.tokenize.toktok import ToktokTokenizer
|
||||
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
|
||||
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
|
||||
|
||||
|
||||
@functools.lru_cache
|
||||
def _get_punkt_tokenizer(language="english"):
|
||||
"""
|
||||
A constructor for the PunktTokenizer that utilizes
|
||||
a lru cache for performance.
|
||||
|
||||
:param language: the model name in the Punkt corpus
|
||||
:type language: str
|
||||
"""
|
||||
return PunktTokenizer(language)
|
||||
|
||||
|
||||
# Standard sentence tokenizer.
|
||||
def sent_tokenize(text, language="english"):
|
||||
"""
|
||||
Return a sentence-tokenized copy of *text*,
|
||||
using NLTK's recommended sentence tokenizer
|
||||
(currently :class:`.PunktSentenceTokenizer`
|
||||
for the specified language).
|
||||
|
||||
:param text: text to split into sentences
|
||||
:param language: the model name in the Punkt corpus
|
||||
"""
|
||||
tokenizer = _get_punkt_tokenizer(language)
|
||||
return tokenizer.tokenize(text)
|
||||
|
||||
|
||||
# Standard word tokenizer.
|
||||
_treebank_word_tokenizer = NLTKWordTokenizer()
|
||||
|
||||
|
||||
def word_tokenize(text, language="english", preserve_line=False):
|
||||
"""
|
||||
Return a tokenized copy of *text*,
|
||||
using NLTK's recommended word tokenizer
|
||||
(currently an improved :class:`.TreebankWordTokenizer`
|
||||
along with :class:`.PunktSentenceTokenizer`
|
||||
for the specified language).
|
||||
|
||||
:param text: text to split into words
|
||||
:type text: str
|
||||
:param language: the model name in the Punkt corpus
|
||||
:type language: str
|
||||
:param preserve_line: A flag to decide whether to sentence tokenize the text or not.
|
||||
:type preserve_line: bool
|
||||
"""
|
||||
sentences = [text] if preserve_line else sent_tokenize(text, language)
|
||||
return [
|
||||
token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,83 @@
|
||||
# Natural Language Toolkit: Tokenizer Interface
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Tokenizer Interface
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Iterator, List, Tuple
|
||||
|
||||
from nltk.internals import overridden
|
||||
from nltk.tokenize.util import string_span_tokenize
|
||||
|
||||
|
||||
class TokenizerI(ABC):
|
||||
"""
|
||||
A processing interface for tokenizing a string.
|
||||
Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def tokenize(self, s: str) -> List[str]:
|
||||
"""
|
||||
Return a tokenized copy of *s*.
|
||||
|
||||
:rtype: List[str]
|
||||
"""
|
||||
if overridden(self.tokenize_sents):
|
||||
return self.tokenize_sents([s])[0]
|
||||
|
||||
def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
|
||||
"""
|
||||
Identify the tokens using integer offsets ``(start_i, end_i)``,
|
||||
where ``s[start_i:end_i]`` is the corresponding token.
|
||||
|
||||
:rtype: Iterator[Tuple[int, int]]
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
|
||||
"""
|
||||
Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
|
||||
|
||||
return [self.tokenize(s) for s in strings]
|
||||
|
||||
:rtype: List[List[str]]
|
||||
"""
|
||||
return [self.tokenize(s) for s in strings]
|
||||
|
||||
def span_tokenize_sents(
|
||||
self, strings: List[str]
|
||||
) -> Iterator[List[Tuple[int, int]]]:
|
||||
"""
|
||||
Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
|
||||
|
||||
return [self.span_tokenize(s) for s in strings]
|
||||
|
||||
:yield: List[Tuple[int, int]]
|
||||
"""
|
||||
for s in strings:
|
||||
yield list(self.span_tokenize(s))
|
||||
|
||||
|
||||
class StringTokenizer(TokenizerI):
|
||||
"""A tokenizer that divides a string into substrings by splitting
|
||||
on the specified string (defined in subclasses).
|
||||
"""
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def _string(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def tokenize(self, s):
|
||||
return s.split(self._string)
|
||||
|
||||
def span_tokenize(self, s):
|
||||
yield from string_span_tokenize(s, self._string)
|
||||
@@ -0,0 +1,458 @@
|
||||
#
|
||||
# Natural Language Toolkit: Twitter Tokenizer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Christopher Potts <cgpotts@stanford.edu>
|
||||
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
|
||||
# Pierpaolo Pantone <> (modifications)
|
||||
# Tom Aarsen <> (modifications)
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
#
|
||||
|
||||
|
||||
"""
|
||||
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
|
||||
domains and tasks. The basic logic is this:
|
||||
|
||||
1. The tuple REGEXPS defines a list of regular expression
|
||||
strings.
|
||||
|
||||
2. The REGEXPS strings are put, in order, into a compiled
|
||||
regular expression object called WORD_RE, under the TweetTokenizer
|
||||
class.
|
||||
|
||||
3. The tokenization is done by WORD_RE.findall(s), where s is the
|
||||
user-supplied string, inside the tokenize() method of the class
|
||||
TweetTokenizer.
|
||||
|
||||
4. When instantiating Tokenizer objects, there are several options:
|
||||
* preserve_case. By default, it is set to True. If it is set to
|
||||
False, then the tokenizer will downcase everything except for
|
||||
emoticons.
|
||||
* reduce_len. By default, it is set to False. It specifies whether
|
||||
to replace repeated character sequences of length 3 or greater
|
||||
with sequences of length 3.
|
||||
* strip_handles. By default, it is set to False. It specifies
|
||||
whether to remove Twitter handles of text used in the
|
||||
`tokenize` method.
|
||||
* match_phone_numbers. By default, it is set to True. It indicates
|
||||
whether the `tokenize` method should look for phone numbers.
|
||||
"""
|
||||
|
||||
|
||||
######################################################################
|
||||
|
||||
import html
|
||||
from typing import List
|
||||
|
||||
import regex # https://github.com/nltk/nltk/issues/2409
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
######################################################################
|
||||
# The following strings are components in the regular expression
|
||||
# that is used for tokenizing. It's important that phone_number
|
||||
# appears first in the final regex (since it can contain whitespace).
|
||||
# It also could matter that tags comes after emoticons, due to the
|
||||
# possibility of having text like
|
||||
#
|
||||
# <:| and some text >:)
|
||||
#
|
||||
# Most importantly, the final element should always be last, since it
|
||||
# does a last ditch whitespace-based tokenization of whatever is left.
|
||||
|
||||
# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?
|
||||
|
||||
# This particular element is used in a couple ways, so we define it
|
||||
# with a name:
|
||||
EMOTICONS = r"""
|
||||
(?:
|
||||
[<>]?
|
||||
[:;=8] # eyes
|
||||
[\-o\*\']? # optional nose
|
||||
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
||||
|
|
||||
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
||||
[\-o\*\']? # optional nose
|
||||
[:;=8] # eyes
|
||||
[<>]?
|
||||
|
|
||||
</?3 # heart
|
||||
)"""
|
||||
|
||||
# URL pattern due to John Gruber, modified by Tom Winzig. See
|
||||
# https://gist.github.com/winzig/8894715
|
||||
|
||||
URLS = r""" # Capture 1: entire matched URL
|
||||
(?:
|
||||
https?: # URL protocol and colon
|
||||
(?:
|
||||
/{1,3} # 1-3 slashes
|
||||
| # or
|
||||
[a-z0-9%] # Single letter or digit or '%'
|
||||
# (Trying not to match e.g. "URI::Escape")
|
||||
)
|
||||
| # or
|
||||
# looks like domain name followed by a slash:
|
||||
[a-z0-9.\-]+[.]
|
||||
(?:[a-z]{2,13})
|
||||
/
|
||||
)
|
||||
(?: # One or more:
|
||||
[^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
|
||||
| # or
|
||||
\([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
|
||||
|
|
||||
\([^\s]+?\) # balanced parens, non-recursive: (...)
|
||||
)+
|
||||
(?: # End with:
|
||||
\([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
|
||||
|
|
||||
\([^\s]+?\) # balanced parens, non-recursive: (...)
|
||||
| # or
|
||||
[^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
|
||||
)
|
||||
| # OR, the following to match naked domains:
|
||||
(?:
|
||||
(?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
|
||||
[a-z0-9]+
|
||||
(?:[.\-][a-z0-9]+)*
|
||||
[.]
|
||||
(?:[a-z]{2,13})
|
||||
\b
|
||||
/?
|
||||
(?!@) # not succeeded by a @,
|
||||
# avoid matching "foo.na" in "foo.na@example.com"
|
||||
)
|
||||
"""
|
||||
|
||||
# emoji flag sequence
|
||||
# https://en.wikipedia.org/wiki/Regional_indicator_symbol
|
||||
# For regex simplicity, include all possible enclosed letter pairs,
|
||||
# not the ISO subset of two-letter regional indicator symbols.
|
||||
# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
|
||||
# Future regional flag support may be handled with the regex for
|
||||
# U+1F3F4 🏴 followed by emoji tag sequences:
|
||||
# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
|
||||
FLAGS = r"""
|
||||
(?:
|
||||
[\U0001F1E6-\U0001F1FF]{2} # all enclosed letter pairs
|
||||
|
|
||||
# English flag
|
||||
\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
|
||||
|
|
||||
# Scottish flag
|
||||
\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
|
||||
|
|
||||
# For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
|
||||
\U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
|
||||
)
|
||||
"""
|
||||
|
||||
# Regex for recognizing phone numbers:
|
||||
PHONE_REGEX = r"""
|
||||
(?:
|
||||
(?: # (international)
|
||||
\+?[01]
|
||||
[ *\-.\)]*
|
||||
)?
|
||||
(?: # (area code)
|
||||
[\(]?
|
||||
\d{3}
|
||||
[ *\-.\)]*
|
||||
)?
|
||||
\d{3} # exchange
|
||||
[ *\-.\)]*
|
||||
\d{4} # base
|
||||
)"""
|
||||
|
||||
# The components of the tokenizer:
|
||||
REGEXPS = (
|
||||
URLS,
|
||||
# ASCII Emoticons
|
||||
EMOTICONS,
|
||||
# HTML tags:
|
||||
r"""<[^>\s]+>""",
|
||||
# ASCII Arrows
|
||||
r"""[\-]+>|<[\-]+""",
|
||||
# Twitter username:
|
||||
r"""(?:@[\w_]+)""",
|
||||
# Twitter hashtags:
|
||||
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
|
||||
# email addresses
|
||||
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
|
||||
# Zero-Width-Joiner and Skin tone modifier emojis
|
||||
""".(?:
|
||||
[\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
|
||||
|
|
||||
[\U0001F3FB-\U0001F3FF]
|
||||
)""",
|
||||
# flags
|
||||
FLAGS,
|
||||
# Remaining word types:
|
||||
r"""
|
||||
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
|
||||
|
|
||||
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
||||
|
|
||||
(?:[\w_]+) # Words without apostrophes or dashes.
|
||||
|
|
||||
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
||||
|
|
||||
(?:\S) # Everything else that isn't whitespace.
|
||||
""",
|
||||
)
|
||||
|
||||
# Take the main components and add a phone regex as the second parameter
|
||||
REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])
|
||||
|
||||
######################################################################
|
||||
# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
|
||||
# the core tokenizing regexes. They are compiled lazily.
|
||||
|
||||
# WORD_RE performs poorly on these patterns:
|
||||
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
|
||||
|
||||
# The emoticon string gets its own regex so that we can preserve case for
|
||||
# them as needed:
|
||||
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
|
||||
|
||||
# These are for regularizing HTML entities to Unicode:
|
||||
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
|
||||
|
||||
# For stripping away handles from a tweet:
|
||||
HANDLES_RE = regex.compile(
|
||||
r"(?<![A-Za-z0-9_!@#\$%&*])@"
|
||||
r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
|
||||
)
|
||||
|
||||
|
||||
######################################################################
|
||||
# Functions for converting html entities
|
||||
######################################################################
|
||||
|
||||
|
||||
def _str_to_unicode(text, encoding=None, errors="strict"):
|
||||
if encoding is None:
|
||||
encoding = "utf-8"
|
||||
if isinstance(text, bytes):
|
||||
return text.decode(encoding, errors)
|
||||
return text
|
||||
|
||||
|
||||
def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
|
||||
"""
|
||||
Remove entities from text by converting them to their
|
||||
corresponding unicode character.
|
||||
|
||||
:param text: a unicode string or a byte string encoded in the given
|
||||
`encoding` (which defaults to 'utf-8').
|
||||
|
||||
:param list keep: list of entity names which should not be replaced.\
|
||||
This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
|
||||
and named entities (such as `` `` or ``>``).
|
||||
|
||||
:param bool remove_illegal: If `True`, entities that can't be converted are\
|
||||
removed. Otherwise, entities that can't be converted are kept "as
|
||||
is".
|
||||
|
||||
:returns: A unicode string with the entities removed.
|
||||
|
||||
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
|
||||
|
||||
>>> from nltk.tokenize.casual import _replace_html_entities
|
||||
>>> _replace_html_entities(b'Price: £100')
|
||||
'Price: \\xa3100'
|
||||
>>> print(_replace_html_entities(b'Price: £100'))
|
||||
Price: £100
|
||||
>>>
|
||||
"""
|
||||
|
||||
def _convert_entity(match):
|
||||
entity_body = match.group(3)
|
||||
if match.group(1):
|
||||
try:
|
||||
if match.group(2):
|
||||
number = int(entity_body, 16)
|
||||
else:
|
||||
number = int(entity_body, 10)
|
||||
# Numeric character references in the 80-9F range are typically
|
||||
# interpreted by browsers as representing the characters mapped
|
||||
# to bytes 80-9F in the Windows-1252 encoding. For more info
|
||||
# see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
|
||||
if 0x80 <= number <= 0x9F:
|
||||
return bytes((number,)).decode("cp1252")
|
||||
except ValueError:
|
||||
number = None
|
||||
else:
|
||||
if entity_body in keep:
|
||||
return match.group(0)
|
||||
number = html.entities.name2codepoint.get(entity_body)
|
||||
if number is not None:
|
||||
try:
|
||||
return chr(number)
|
||||
except (ValueError, OverflowError):
|
||||
pass
|
||||
|
||||
return "" if remove_illegal else match.group(0)
|
||||
|
||||
return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
|
||||
|
||||
|
||||
######################################################################
|
||||
|
||||
|
||||
class TweetTokenizer(TokenizerI):
|
||||
r"""
|
||||
Tokenizer for tweets.
|
||||
|
||||
>>> from nltk.tokenize import TweetTokenizer
|
||||
>>> tknzr = TweetTokenizer()
|
||||
>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
|
||||
>>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
|
||||
['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
|
||||
'<--']
|
||||
|
||||
Examples using `strip_handles` and `reduce_len parameters`:
|
||||
|
||||
>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
|
||||
>>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
|
||||
>>> tknzr.tokenize(s1)
|
||||
[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
|
||||
"""
|
||||
|
||||
# Values used to lazily compile WORD_RE and PHONE_WORD_RE,
|
||||
# which are the core tokenizing regexes.
|
||||
_WORD_RE = None
|
||||
_PHONE_WORD_RE = None
|
||||
|
||||
######################################################################
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
preserve_case=True,
|
||||
reduce_len=False,
|
||||
strip_handles=False,
|
||||
match_phone_numbers=True,
|
||||
):
|
||||
"""
|
||||
Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.
|
||||
|
||||
:param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
|
||||
of text used in the `tokenize` method. Defaults to True.
|
||||
:type preserve_case: bool
|
||||
:param reduce_len: Flag indicating whether to replace repeated character sequences
|
||||
of length 3 or greater with sequences of length 3. Defaults to False.
|
||||
:type reduce_len: bool
|
||||
:param strip_handles: Flag indicating whether to remove Twitter handles of text used
|
||||
in the `tokenize` method. Defaults to False.
|
||||
:type strip_handles: bool
|
||||
:param match_phone_numbers: Flag indicating whether the `tokenize` method should look
|
||||
for phone numbers. Defaults to True.
|
||||
:type match_phone_numbers: bool
|
||||
"""
|
||||
self.preserve_case = preserve_case
|
||||
self.reduce_len = reduce_len
|
||||
self.strip_handles = strip_handles
|
||||
self.match_phone_numbers = match_phone_numbers
|
||||
|
||||
def tokenize(self, text: str) -> List[str]:
|
||||
"""Tokenize the input text.
|
||||
|
||||
:param text: str
|
||||
:rtype: list(str)
|
||||
:return: a tokenized list of strings; joining this list returns\
|
||||
the original string if `preserve_case=False`.
|
||||
"""
|
||||
# Fix HTML character entities:
|
||||
text = _replace_html_entities(text)
|
||||
# Remove username handles
|
||||
if self.strip_handles:
|
||||
text = remove_handles(text)
|
||||
# Normalize word lengthening
|
||||
if self.reduce_len:
|
||||
text = reduce_lengthening(text)
|
||||
# Shorten problematic sequences of characters
|
||||
safe_text = HANG_RE.sub(r"\1\1\1", text)
|
||||
# Recognise phone numbers during tokenization
|
||||
if self.match_phone_numbers:
|
||||
words = self.PHONE_WORD_RE.findall(safe_text)
|
||||
else:
|
||||
words = self.WORD_RE.findall(safe_text)
|
||||
# Possibly alter the case, but avoid changing emoticons like :D into :d:
|
||||
if not self.preserve_case:
|
||||
words = list(
|
||||
map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
|
||||
)
|
||||
return words
|
||||
|
||||
@property
|
||||
def WORD_RE(self) -> "regex.Pattern":
|
||||
"""Core TweetTokenizer regex"""
|
||||
# Compiles the regex for this and all future instantiations of TweetTokenizer.
|
||||
if not type(self)._WORD_RE:
|
||||
type(self)._WORD_RE = regex.compile(
|
||||
f"({'|'.join(REGEXPS)})",
|
||||
regex.VERBOSE | regex.I | regex.UNICODE,
|
||||
)
|
||||
return type(self)._WORD_RE
|
||||
|
||||
@property
|
||||
def PHONE_WORD_RE(self) -> "regex.Pattern":
|
||||
"""Secondary core TweetTokenizer regex"""
|
||||
# Compiles the regex for this and all future instantiations of TweetTokenizer.
|
||||
if not type(self)._PHONE_WORD_RE:
|
||||
type(self)._PHONE_WORD_RE = regex.compile(
|
||||
f"({'|'.join(REGEXPS_PHONE)})",
|
||||
regex.VERBOSE | regex.I | regex.UNICODE,
|
||||
)
|
||||
return type(self)._PHONE_WORD_RE
|
||||
|
||||
|
||||
######################################################################
|
||||
# Normalization Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
def reduce_lengthening(text):
|
||||
"""
|
||||
Replace repeated character sequences of length 3 or greater with sequences
|
||||
of length 3.
|
||||
"""
|
||||
pattern = regex.compile(r"(.)\1{2,}")
|
||||
return pattern.sub(r"\1\1\1", text)
|
||||
|
||||
|
||||
def remove_handles(text):
|
||||
"""
|
||||
Remove Twitter username handles from text.
|
||||
"""
|
||||
# Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
|
||||
return HANDLES_RE.sub(" ", text)
|
||||
|
||||
|
||||
######################################################################
|
||||
# Tokenization Function
|
||||
######################################################################
|
||||
|
||||
|
||||
def casual_tokenize(
|
||||
text,
|
||||
preserve_case=True,
|
||||
reduce_len=False,
|
||||
strip_handles=False,
|
||||
match_phone_numbers=True,
|
||||
):
|
||||
"""
|
||||
Convenience function for wrapping the tokenizer.
|
||||
"""
|
||||
return TweetTokenizer(
|
||||
preserve_case=preserve_case,
|
||||
reduce_len=reduce_len,
|
||||
strip_handles=strip_handles,
|
||||
match_phone_numbers=match_phone_numbers,
|
||||
).tokenize(text)
|
||||
|
||||
|
||||
###############################################################################
|
||||
@@ -0,0 +1,234 @@
|
||||
# Natural Language Toolkit: NLTK's very own tokenizer.
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Liling Tan
|
||||
# Tom Aarsen <> (modifications)
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from typing import Iterator, List, Tuple
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.tokenize.util import align_tokens
|
||||
|
||||
|
||||
class MacIntyreContractions:
|
||||
"""
|
||||
List of contractions adapted from Robert MacIntyre's tokenizer.
|
||||
"""
|
||||
|
||||
CONTRACTIONS2 = [
|
||||
r"(?i)\b(can)(?#X)(not)\b",
|
||||
r"(?i)\b(d)(?#X)('ye)\b",
|
||||
r"(?i)\b(gim)(?#X)(me)\b",
|
||||
r"(?i)\b(gon)(?#X)(na)\b",
|
||||
r"(?i)\b(got)(?#X)(ta)\b",
|
||||
r"(?i)\b(lem)(?#X)(me)\b",
|
||||
r"(?i)\b(more)(?#X)('n)\b",
|
||||
r"(?i)\b(wan)(?#X)(na)(?=\s)",
|
||||
]
|
||||
CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
|
||||
CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
|
||||
|
||||
|
||||
class NLTKWordTokenizer(TokenizerI):
|
||||
"""
|
||||
The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
|
||||
|
||||
This is the method that is invoked by ``word_tokenize()``. It assumes that the
|
||||
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
|
||||
|
||||
The tokenizer is "destructive" such that the regexes applied will munge the
|
||||
input string to a state beyond re-construction. It is possible to apply
|
||||
`TreebankWordDetokenizer.detokenize` to the tokenized outputs of
|
||||
`NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
|
||||
revert to the original string.
|
||||
"""
|
||||
|
||||
# Starting quotes.
|
||||
STARTING_QUOTES = [
|
||||
(re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
|
||||
(re.compile(r"^\""), r"``"),
|
||||
(re.compile(r"(``)"), r" \1 "),
|
||||
(re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
|
||||
(re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
|
||||
]
|
||||
|
||||
# Ending quotes.
|
||||
ENDING_QUOTES = [
|
||||
(re.compile("([»”’])", re.U), r" \1 "),
|
||||
(re.compile(r"''"), " '' "),
|
||||
(re.compile(r'"'), " '' "),
|
||||
(re.compile(r"\s+"), " "),
|
||||
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
|
||||
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
|
||||
]
|
||||
|
||||
# For improvements for starting/closing quotes from TreebankWordTokenizer,
|
||||
# see discussion on https://github.com/nltk/nltk/pull/1437
|
||||
# Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
|
||||
# - chevron quotes u'\xab' and u'\xbb'
|
||||
# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
|
||||
# See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
|
||||
# Also, behavior of splitting on clitics now follows Stanford CoreNLP
|
||||
# - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
|
||||
|
||||
# Punctuation.
|
||||
PUNCTUATION = [
|
||||
(re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
|
||||
(re.compile(r"([:,])([^\d])"), r" \1 \2"),
|
||||
(re.compile(r"([:,])$"), r" \1 "),
|
||||
(
|
||||
re.compile(r"\.{2,}", re.U),
|
||||
r" \g<0> ",
|
||||
), # See https://github.com/nltk/nltk/pull/2322
|
||||
(re.compile(r"[;@#$%&]"), r" \g<0> "),
|
||||
(
|
||||
re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
|
||||
r"\1 \2\3 ",
|
||||
), # Handles the final period.
|
||||
(re.compile(r"[?!]"), r" \g<0> "),
|
||||
(re.compile(r"([^'])' "), r"\1 ' "),
|
||||
(
|
||||
re.compile(r"[*]", re.U),
|
||||
r" \g<0> ",
|
||||
), # See https://github.com/nltk/nltk/pull/2322
|
||||
]
|
||||
|
||||
# Pads parentheses
|
||||
PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
|
||||
|
||||
# Optionally: Convert parentheses, brackets and converts them to PTB symbols.
|
||||
CONVERT_PARENTHESES = [
|
||||
(re.compile(r"\("), "-LRB-"),
|
||||
(re.compile(r"\)"), "-RRB-"),
|
||||
(re.compile(r"\["), "-LSB-"),
|
||||
(re.compile(r"\]"), "-RSB-"),
|
||||
(re.compile(r"\{"), "-LCB-"),
|
||||
(re.compile(r"\}"), "-RCB-"),
|
||||
]
|
||||
|
||||
DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
|
||||
|
||||
# List of contractions adapted from Robert MacIntyre's tokenizer.
|
||||
_contractions = MacIntyreContractions()
|
||||
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
|
||||
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
|
||||
|
||||
def tokenize(
|
||||
self, text: str, convert_parentheses: bool = False, return_str: bool = False
|
||||
) -> List[str]:
|
||||
r"""Return a tokenized copy of `text`.
|
||||
|
||||
>>> from nltk.tokenize import NLTKWordTokenizer
|
||||
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
|
||||
>>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
|
||||
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
|
||||
'of', 'them.', 'Thanks', '.']
|
||||
>>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
|
||||
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
|
||||
'of', 'them.', 'Thanks', '.']
|
||||
|
||||
|
||||
:param text: A string with a sentence or sentences.
|
||||
:type text: str
|
||||
:param convert_parentheses: if True, replace parentheses to PTB symbols,
|
||||
e.g. `(` to `-LRB-`. Defaults to False.
|
||||
:type convert_parentheses: bool, optional
|
||||
:param return_str: If True, return tokens as space-separated string,
|
||||
defaults to False.
|
||||
:type return_str: bool, optional
|
||||
:return: List of tokens from `text`.
|
||||
:rtype: List[str]
|
||||
"""
|
||||
if return_str:
|
||||
warnings.warn(
|
||||
"Parameter 'return_str' has been deprecated and should no "
|
||||
"longer be used.",
|
||||
category=DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
for regexp, substitution in self.STARTING_QUOTES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
for regexp, substitution in self.PUNCTUATION:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Handles parentheses.
|
||||
regexp, substitution = self.PARENS_BRACKETS
|
||||
text = regexp.sub(substitution, text)
|
||||
# Optionally convert parentheses
|
||||
if convert_parentheses:
|
||||
for regexp, substitution in self.CONVERT_PARENTHESES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Handles double dash.
|
||||
regexp, substitution = self.DOUBLE_DASHES
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# add extra space to make things easier
|
||||
text = " " + text + " "
|
||||
|
||||
for regexp, substitution in self.ENDING_QUOTES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
for regexp in self.CONTRACTIONS2:
|
||||
text = regexp.sub(r" \1 \2 ", text)
|
||||
for regexp in self.CONTRACTIONS3:
|
||||
text = regexp.sub(r" \1 \2 ", text)
|
||||
|
||||
# We are not using CONTRACTIONS4 since
|
||||
# they are also commented out in the SED scripts
|
||||
# for regexp in self._contractions.CONTRACTIONS4:
|
||||
# text = regexp.sub(r' \1 \2 \3 ', text)
|
||||
|
||||
return text.split()
|
||||
|
||||
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
|
||||
r"""
|
||||
Returns the spans of the tokens in ``text``.
|
||||
Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
|
||||
|
||||
>>> from nltk.tokenize import NLTKWordTokenizer
|
||||
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
|
||||
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
|
||||
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
|
||||
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
|
||||
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
|
||||
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
|
||||
True
|
||||
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
|
||||
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
|
||||
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
|
||||
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
|
||||
True
|
||||
|
||||
:param text: A string with a sentence or sentences.
|
||||
:type text: str
|
||||
:yield: Tuple[int, int]
|
||||
"""
|
||||
raw_tokens = self.tokenize(text)
|
||||
|
||||
# Convert converted quotes back to original double quotes
|
||||
# Do this only if original text contains double quote(s) or double
|
||||
# single-quotes (because '' might be transformed to `` if it is
|
||||
# treated as starting quotes).
|
||||
if ('"' in text) or ("''" in text):
|
||||
# Find double quotes and converted quotes
|
||||
matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
|
||||
|
||||
# Replace converted quotes back to double quotes
|
||||
tokens = [
|
||||
matched.pop(0) if tok in ['"', "``", "''"] else tok
|
||||
for tok in raw_tokens
|
||||
]
|
||||
else:
|
||||
tokens = raw_tokens
|
||||
|
||||
yield from align_tokens(tokens, text)
|
||||
@@ -0,0 +1,147 @@
|
||||
# Natural Language Toolkit: Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Christopher Hench <chris.l.hench@gmail.com>
|
||||
# Alex Estes
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The Legality Principle is a language agnostic principle maintaining that syllable
|
||||
onsets and codas (the beginning and ends of syllables not including the vowel)
|
||||
are only legal if they are found as word onsets or codas in the language. The English
|
||||
word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found
|
||||
word-initially in the English language (Bartlett et al.). This principle was first proposed
|
||||
in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''.
|
||||
|
||||
Kahn further argues that there is a ''strong tendency to syllabify in such a way that
|
||||
initial clusters are of maximal length, consistent with the general constraints on
|
||||
word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
|
||||
the longest legal onset is preferable---''Onset Maximization''.
|
||||
|
||||
The default implementation assumes an English vowel set, but the `vowels` attribute
|
||||
can be set to IPA or any other alphabet's vowel set for the use-case.
|
||||
Both a valid set of vowels as well as a text corpus of words in the language
|
||||
are necessary to determine legal onsets and subsequently syllabify words.
|
||||
|
||||
The legality principle with onset maximization is a universal syllabification algorithm,
|
||||
but that does not mean it performs equally across languages. Bartlett et al. (2009)
|
||||
is a good benchmark for English accuracy if utilizing IPA (pg. 311).
|
||||
|
||||
References:
|
||||
|
||||
- Otto Jespersen. 1904. Lehrbuch der Phonetik.
|
||||
Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
|
||||
- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11.
|
||||
- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976).
|
||||
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
|
||||
In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
|
||||
Cambridge, MIT Press. pp. 107-136.
|
||||
- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409–436.
|
||||
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
|
||||
In HLT-NAACL. pp. 308-316.
|
||||
- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley.
|
||||
"""
|
||||
|
||||
from collections import Counter
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
|
||||
class LegalitySyllableTokenizer(TokenizerI):
|
||||
"""
|
||||
Syllabifies words based on the Legality Principle and Onset Maximization.
|
||||
|
||||
>>> from nltk.tokenize import LegalitySyllableTokenizer
|
||||
>>> from nltk import word_tokenize
|
||||
>>> from nltk.corpus import words
|
||||
>>> text = "This is a wonderful sentence."
|
||||
>>> text_words = word_tokenize(text)
|
||||
>>> LP = LegalitySyllableTokenizer(words.words())
|
||||
>>> [LP.tokenize(word) for word in text_words]
|
||||
[['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
|
||||
):
|
||||
"""
|
||||
:param tokenized_source_text: List of valid tokens in the language
|
||||
:type tokenized_source_text: list(str)
|
||||
:param vowels: Valid vowels in language or IPA representation
|
||||
:type vowels: str
|
||||
:param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset
|
||||
:type legal_frequency_threshold: float
|
||||
"""
|
||||
self.legal_frequency_threshold = legal_frequency_threshold
|
||||
self.vowels = vowels
|
||||
self.legal_onsets = self.find_legal_onsets(tokenized_source_text)
|
||||
|
||||
def find_legal_onsets(self, words):
|
||||
"""
|
||||
Gathers all onsets and then return only those above the frequency threshold
|
||||
|
||||
:param words: List of words in a language
|
||||
:type words: list(str)
|
||||
:return: Set of legal onsets
|
||||
:rtype: set(str)
|
||||
"""
|
||||
onsets = [self.onset(word) for word in words]
|
||||
legal_onsets = [
|
||||
k
|
||||
for k, v in Counter(onsets).items()
|
||||
if (v / len(onsets)) > self.legal_frequency_threshold
|
||||
]
|
||||
return set(legal_onsets)
|
||||
|
||||
def onset(self, word):
|
||||
"""
|
||||
Returns consonant cluster of word, i.e. all characters until the first vowel.
|
||||
|
||||
:param word: Single word or token
|
||||
:type word: str
|
||||
:return: String of characters of onset
|
||||
:rtype: str
|
||||
"""
|
||||
onset = ""
|
||||
for c in word.lower():
|
||||
if c in self.vowels:
|
||||
return onset
|
||||
else:
|
||||
onset += c
|
||||
return onset
|
||||
|
||||
def tokenize(self, token):
|
||||
"""
|
||||
Apply the Legality Principle in combination with
|
||||
Onset Maximization to return a list of syllables.
|
||||
|
||||
:param token: Single word or token
|
||||
:type token: str
|
||||
:return syllable_list: Single word or token broken up into syllables.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
syllables = []
|
||||
syllable, current_onset = "", ""
|
||||
vowel, onset = False, False
|
||||
for char in token[::-1]:
|
||||
char_lower = char.lower()
|
||||
if not vowel:
|
||||
syllable += char
|
||||
vowel = bool(char_lower in self.vowels)
|
||||
else:
|
||||
if char_lower + current_onset[::-1] in self.legal_onsets:
|
||||
syllable += char
|
||||
current_onset += char_lower
|
||||
onset = True
|
||||
elif char_lower in self.vowels and not onset:
|
||||
syllable += char
|
||||
current_onset += char_lower
|
||||
else:
|
||||
syllables.append(syllable)
|
||||
syllable = char
|
||||
current_onset = ""
|
||||
vowel = bool(char_lower in self.vowels)
|
||||
syllables.append(syllable)
|
||||
syllables_ordered = [syllable[::-1] for syllable in syllables][::-1]
|
||||
return syllables_ordered
|
||||
124
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/mwe.py
Normal file
124
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/mwe.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Multi-Word Expression tokenizer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Multi-Word Expression Tokenizer
|
||||
|
||||
A ``MWETokenizer`` takes a string which has already been divided into tokens and
|
||||
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
|
||||
of MWEs:
|
||||
|
||||
|
||||
>>> from nltk.tokenize import MWETokenizer
|
||||
|
||||
>>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
|
||||
>>> tokenizer.add_mwe(('in', 'spite', 'of'))
|
||||
|
||||
>>> tokenizer.tokenize('Testing testing testing one two three'.split())
|
||||
['Testing', 'testing', 'testing', 'one', 'two', 'three']
|
||||
|
||||
>>> tokenizer.tokenize('This is a test in spite'.split())
|
||||
['This', 'is', 'a', 'test', 'in', 'spite']
|
||||
|
||||
>>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
|
||||
['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
|
||||
|
||||
"""
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.util import Trie
|
||||
|
||||
|
||||
class MWETokenizer(TokenizerI):
|
||||
"""A tokenizer that processes tokenized text and merges multi-word expressions
|
||||
into single tokens.
|
||||
"""
|
||||
|
||||
def __init__(self, mwes=None, separator="_"):
|
||||
"""Initialize the multi-word tokenizer with a list of expressions and a
|
||||
separator
|
||||
|
||||
:type mwes: list(list(str))
|
||||
:param mwes: A sequence of multi-word expressions to be merged, where
|
||||
each MWE is a sequence of strings.
|
||||
:type separator: str
|
||||
:param separator: String that should be inserted between words in a multi-word
|
||||
expression token. (Default is '_')
|
||||
|
||||
"""
|
||||
if not mwes:
|
||||
mwes = []
|
||||
self._mwes = Trie(mwes)
|
||||
self._separator = separator
|
||||
|
||||
def add_mwe(self, mwe):
|
||||
"""Add a multi-word expression to the lexicon (stored as a word trie)
|
||||
|
||||
We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
|
||||
The key True marks the end of a valid MWE.
|
||||
|
||||
:param mwe: The multi-word expression we're adding into the word trie
|
||||
:type mwe: tuple(str) or list(str)
|
||||
|
||||
:Example:
|
||||
|
||||
>>> tokenizer = MWETokenizer()
|
||||
>>> tokenizer.add_mwe(('a', 'b'))
|
||||
>>> tokenizer.add_mwe(('a', 'b', 'c'))
|
||||
>>> tokenizer.add_mwe(('a', 'x'))
|
||||
>>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
|
||||
>>> tokenizer._mwes == expected
|
||||
True
|
||||
|
||||
"""
|
||||
self._mwes.insert(mwe)
|
||||
|
||||
def tokenize(self, text):
|
||||
"""
|
||||
|
||||
:param text: A list containing tokenized text
|
||||
:type text: list(str)
|
||||
:return: A list of the tokenized text with multi-words merged together
|
||||
:rtype: list(str)
|
||||
|
||||
:Example:
|
||||
|
||||
>>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
|
||||
>>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
|
||||
['An', "hors+d'oeuvre", 'tonight,', 'sir?']
|
||||
|
||||
"""
|
||||
i = 0
|
||||
n = len(text)
|
||||
result = []
|
||||
|
||||
while i < n:
|
||||
if text[i] in self._mwes:
|
||||
# possible MWE match
|
||||
j = i
|
||||
trie = self._mwes
|
||||
last_match = -1
|
||||
while j < n and text[j] in trie: # and len(trie[text[j]]) > 0 :
|
||||
trie = trie[text[j]]
|
||||
j = j + 1
|
||||
if Trie.LEAF in trie:
|
||||
last_match = j
|
||||
else:
|
||||
if last_match > -1:
|
||||
j = last_match
|
||||
|
||||
if Trie.LEAF in trie or last_match > -1:
|
||||
# success!
|
||||
result.append(self._separator.join(text[i:j]))
|
||||
i = j
|
||||
else:
|
||||
# no match, so backtrack
|
||||
result.append(text[i])
|
||||
i += 1
|
||||
else:
|
||||
result.append(text[i])
|
||||
i += 1
|
||||
return result
|
||||
179
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/nist.py
Normal file
179
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/nist.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
|
||||
#
|
||||
# Copyright (C) 2001-2015 NLTK Project
|
||||
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
|
||||
# Contributors: Ozan Caglayan, Wiktor Stribizew
|
||||
#
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
|
||||
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
|
||||
which was also ported into Python in
|
||||
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
|
||||
"""
|
||||
|
||||
|
||||
import io
|
||||
import re
|
||||
|
||||
from nltk.corpus import perluniprops
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.tokenize.util import xml_unescape
|
||||
|
||||
|
||||
class NISTTokenizer(TokenizerI):
|
||||
"""
|
||||
This NIST tokenizer is sentence-based instead of the original
|
||||
paragraph-based tokenization from mteval-14.pl; The sentence-based
|
||||
tokenization is consistent with the other tokenizers available in NLTK.
|
||||
|
||||
>>> from nltk.tokenize.nist import NISTTokenizer
|
||||
>>> nist = NISTTokenizer()
|
||||
>>> s = "Good muffins cost $3.88 in New York."
|
||||
>>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
|
||||
>>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
|
||||
>>> nist.tokenize(s, lowercase=False) == expected_cased
|
||||
True
|
||||
>>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased.
|
||||
True
|
||||
|
||||
The international_tokenize() is the preferred function when tokenizing
|
||||
non-european text, e.g.
|
||||
|
||||
>>> from nltk.tokenize.nist import NISTTokenizer
|
||||
>>> nist = NISTTokenizer()
|
||||
|
||||
# Input strings.
|
||||
>>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
|
||||
>>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
|
||||
>>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
|
||||
|
||||
# Expected tokens.
|
||||
>>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
|
||||
>>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
|
||||
>>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
|
||||
|
||||
>>> nist.international_tokenize(albb)[:10] == expected_albb
|
||||
True
|
||||
>>> nist.international_tokenize(amz)[:10] == expected_amz
|
||||
True
|
||||
>>> nist.international_tokenize(rkt)[:10] == expected_rkt
|
||||
True
|
||||
|
||||
# Doctest for patching issue #1926
|
||||
>>> sent = u'this is a foo\u2604sentence.'
|
||||
>>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
|
||||
>>> nist.international_tokenize(sent) == expected_sent
|
||||
True
|
||||
"""
|
||||
|
||||
# Strip "skipped" tags
|
||||
STRIP_SKIP = re.compile("<skipped>"), ""
|
||||
# Strip end-of-line hyphenation and join lines
|
||||
STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
|
||||
# Tokenize punctuation.
|
||||
PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
|
||||
# Tokenize period and comma unless preceded by a digit.
|
||||
PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
|
||||
# Tokenize period and comma unless followed by a digit.
|
||||
PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
|
||||
# Tokenize dash when preceded by a digit
|
||||
DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
|
||||
|
||||
LANG_DEPENDENT_REGEXES = [
|
||||
PUNCT,
|
||||
PERIOD_COMMA_PRECEED,
|
||||
PERIOD_COMMA_FOLLOW,
|
||||
DASH_PRECEED_DIGIT,
|
||||
]
|
||||
|
||||
# Perluniprops characters used in NIST tokenizer.
|
||||
pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N}
|
||||
pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P}
|
||||
pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S}
|
||||
|
||||
# Python regexes needs to escape some special symbols, see
|
||||
# see https://stackoverflow.com/q/45670950/610569
|
||||
number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
|
||||
punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
|
||||
symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
|
||||
|
||||
# Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
|
||||
# (i) strip trailing and heading spaces and
|
||||
# (ii) de-deuplicate spaces.
|
||||
# In Python, this would do: ' '.join(str.strip().split())
|
||||
# Thus, the next two lines were commented out.
|
||||
# Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
|
||||
# Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
|
||||
|
||||
# Pads non-ascii strings with space.
|
||||
NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
|
||||
# Tokenize any punctuation unless followed AND preceded by a digit.
|
||||
PUNCT_1 = (
|
||||
re.compile(f"([{number_regex}])([{punct_regex}])"),
|
||||
"\\1 \\2 ",
|
||||
)
|
||||
PUNCT_2 = (
|
||||
re.compile(f"([{punct_regex}])([{number_regex}])"),
|
||||
" \\1 \\2",
|
||||
)
|
||||
# Tokenize symbols
|
||||
SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "
|
||||
|
||||
INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
|
||||
|
||||
def lang_independent_sub(self, text):
|
||||
"""Performs the language independent string substituitions."""
|
||||
# It's a strange order of regexes.
|
||||
# It'll be better to unescape after STRIP_EOL_HYPHEN
|
||||
# but let's keep it close to the original NIST implementation.
|
||||
regexp, substitution = self.STRIP_SKIP
|
||||
text = regexp.sub(substitution, text)
|
||||
text = xml_unescape(text)
|
||||
regexp, substitution = self.STRIP_EOL_HYPHEN
|
||||
text = regexp.sub(substitution, text)
|
||||
return text
|
||||
|
||||
def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
|
||||
text = str(text)
|
||||
# Language independent regex.
|
||||
text = self.lang_independent_sub(text)
|
||||
# Language dependent regex.
|
||||
if western_lang:
|
||||
# Pad string with whitespace.
|
||||
text = " " + text + " "
|
||||
if lowercase:
|
||||
text = text.lower()
|
||||
for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
|
||||
text = regexp.sub(substitution, text)
|
||||
# Remove contiguous whitespaces.
|
||||
text = " ".join(text.split())
|
||||
# Finally, strips heading and trailing spaces
|
||||
# and converts output string into unicode.
|
||||
text = str(text.strip())
|
||||
return text if return_str else text.split()
|
||||
|
||||
def international_tokenize(
|
||||
self, text, lowercase=False, split_non_ascii=True, return_str=False
|
||||
):
|
||||
text = str(text)
|
||||
# Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
|
||||
# first before unescaping.
|
||||
regexp, substitution = self.STRIP_SKIP
|
||||
text = regexp.sub(substitution, text)
|
||||
regexp, substitution = self.STRIP_EOL_HYPHEN
|
||||
text = regexp.sub(substitution, text)
|
||||
text = xml_unescape(text)
|
||||
|
||||
if lowercase:
|
||||
text = text.lower()
|
||||
|
||||
for regexp, substitution in self.INTERNATIONAL_REGEXES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Make sure that there's only one space only between words.
|
||||
# Strip leading and trailing spaces.
|
||||
text = " ".join(text.strip().split())
|
||||
return text if return_str else text.split()
|
||||
1826
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/punkt.py
Normal file
1826
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/punkt.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,220 @@
|
||||
# Natural Language Toolkit: Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# Trevor Cohn <tacohn@csse.unimelb.edu.au>
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
r"""
|
||||
Regular-Expression Tokenizers
|
||||
|
||||
A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
|
||||
For example, the following tokenizer forms tokens out of alphabetic sequences,
|
||||
money expressions, and any other non-whitespace sequences:
|
||||
|
||||
>>> from nltk.tokenize import RegexpTokenizer
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
|
||||
>>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
|
||||
A ``RegexpTokenizer`` can use its regexp to match delimiters instead:
|
||||
|
||||
>>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)
|
||||
>>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
|
||||
|
||||
Note that empty tokens are not returned when the delimiter appears at
|
||||
the start or end of the string.
|
||||
|
||||
The material between the tokens is discarded. For example,
|
||||
the following tokenizer selects just the capitalized words:
|
||||
|
||||
>>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
|
||||
>>> capword_tokenizer.tokenize(s)
|
||||
['Good', 'New', 'York', 'Please', 'Thanks']
|
||||
|
||||
This module contains several subclasses of ``RegexpTokenizer``
|
||||
that use pre-defined regular expressions.
|
||||
|
||||
>>> from nltk.tokenize import BlanklineTokenizer
|
||||
>>> # Uses '\s*\n\s*\n\s*':
|
||||
>>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
|
||||
'Thanks.']
|
||||
|
||||
All of the regular expression tokenizers are also available as functions:
|
||||
|
||||
>>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
|
||||
>>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
>>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
|
||||
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
>>> blankline_tokenize(s)
|
||||
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']
|
||||
|
||||
Caution: The function ``regexp_tokenize()`` takes the text as its
|
||||
first argument, and the regular expression pattern as its second
|
||||
argument. This differs from the conventions used by Python's
|
||||
``re`` functions, where the pattern is always the first argument.
|
||||
(This is for consistency with the other NLTK tokenizers.)
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.tokenize.util import regexp_span_tokenize
|
||||
|
||||
|
||||
class RegexpTokenizer(TokenizerI):
|
||||
r"""
|
||||
A tokenizer that splits a string using a regular expression, which
|
||||
matches either the tokens or the separators between tokens.
|
||||
|
||||
>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
|
||||
|
||||
:type pattern: str
|
||||
:param pattern: The pattern used to build this tokenizer.
|
||||
(This pattern must not contain capturing parentheses;
|
||||
use non-capturing parentheses, e.g. (?:...), instead)
|
||||
:type gaps: bool
|
||||
:param gaps: True if this tokenizer's pattern should be used
|
||||
to find separators between tokens; False if this
|
||||
tokenizer's pattern should be used to find the tokens
|
||||
themselves.
|
||||
:type discard_empty: bool
|
||||
:param discard_empty: True if any empty tokens `''`
|
||||
generated by the tokenizer should be discarded. Empty
|
||||
tokens can only be generated if `_gaps == True`.
|
||||
:type flags: int
|
||||
:param flags: The regexp flags used to compile this
|
||||
tokenizer's pattern. By default, the following flags are
|
||||
used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pattern,
|
||||
gaps=False,
|
||||
discard_empty=True,
|
||||
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
|
||||
):
|
||||
# If they gave us a regexp object, extract the pattern.
|
||||
pattern = getattr(pattern, "pattern", pattern)
|
||||
|
||||
self._pattern = pattern
|
||||
self._gaps = gaps
|
||||
self._discard_empty = discard_empty
|
||||
self._flags = flags
|
||||
self._regexp = None
|
||||
|
||||
def _check_regexp(self):
|
||||
if self._regexp is None:
|
||||
self._regexp = re.compile(self._pattern, self._flags)
|
||||
|
||||
def tokenize(self, text):
|
||||
self._check_regexp()
|
||||
# If our regexp matches gaps, use re.split:
|
||||
if self._gaps:
|
||||
if self._discard_empty:
|
||||
return [tok for tok in self._regexp.split(text) if tok]
|
||||
else:
|
||||
return self._regexp.split(text)
|
||||
|
||||
# If our regexp matches tokens, use re.findall:
|
||||
else:
|
||||
return self._regexp.findall(text)
|
||||
|
||||
def span_tokenize(self, text):
|
||||
self._check_regexp()
|
||||
|
||||
if self._gaps:
|
||||
for left, right in regexp_span_tokenize(text, self._regexp):
|
||||
if not (self._discard_empty and left == right):
|
||||
yield left, right
|
||||
else:
|
||||
for m in re.finditer(self._regexp, text):
|
||||
yield m.span()
|
||||
|
||||
def __repr__(self):
|
||||
return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format(
|
||||
self.__class__.__name__,
|
||||
self._pattern,
|
||||
self._gaps,
|
||||
self._discard_empty,
|
||||
self._flags,
|
||||
)
|
||||
|
||||
|
||||
class WhitespaceTokenizer(RegexpTokenizer):
|
||||
r"""
|
||||
Tokenize a string on whitespace (space, tab, newline).
|
||||
In general, users should use the string ``split()`` method instead.
|
||||
|
||||
>>> from nltk.tokenize import WhitespaceTokenizer
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
RegexpTokenizer.__init__(self, r"\s+", gaps=True)
|
||||
|
||||
|
||||
class BlanklineTokenizer(RegexpTokenizer):
|
||||
"""
|
||||
Tokenize a string, treating any sequence of blank lines as a delimiter.
|
||||
Blank lines are defined as lines containing no characters, except for
|
||||
space or tab characters.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)
|
||||
|
||||
|
||||
class WordPunctTokenizer(RegexpTokenizer):
|
||||
r"""
|
||||
Tokenize a text into a sequence of alphabetic and
|
||||
non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.
|
||||
|
||||
>>> from nltk.tokenize import WordPunctTokenizer
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
|
||||
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Tokenization Functions
|
||||
######################################################################
|
||||
|
||||
|
||||
def regexp_tokenize(
|
||||
text,
|
||||
pattern,
|
||||
gaps=False,
|
||||
discard_empty=True,
|
||||
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
|
||||
):
|
||||
"""
|
||||
Return a tokenized copy of *text*. See :class:`.RegexpTokenizer`
|
||||
for descriptions of the arguments.
|
||||
"""
|
||||
tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
|
||||
return tokenizer.tokenize(text)
|
||||
|
||||
|
||||
blankline_tokenize = BlanklineTokenizer().tokenize
|
||||
wordpunct_tokenize = WordPunctTokenizer().tokenize
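
if __name__ == "__main__":
    # Illustrative sketch, not part of the upstream module: the same helper can
    # treat the pattern as the tokens themselves (gaps=False, the default) or
    # as the delimiter between tokens (gaps=True). "demo" is just sample text.
    demo = "Good muffins cost $3.88 in New York."
    print(regexp_tokenize(demo, pattern=r"\w+|\$[\d\.]+|\S+"))
    print(regexp_tokenize(demo, pattern=r"\s+", gaps=True))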
|
||||
149 Backend/venv/lib/python3.12/site-packages/nltk/tokenize/repp.py Normal file
@@ -0,0 +1,149 @@
|
||||
# Natural Language Toolkit: Interface to the Repp Tokenizer
|
||||
#
|
||||
# Copyright (C) 2001-2015 NLTK Project
|
||||
# Authors: Rebecca Dridan and Stephan Oepen
|
||||
# Contributors: Liling Tan
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from nltk.data import ZipFilePathPointer
|
||||
from nltk.internals import find_dir
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
|
||||
class ReppTokenizer(TokenizerI):
|
||||
"""
|
||||
A class for word tokenization using the REPP parser described in
|
||||
Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
|
||||
Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
|
||||
and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
|
||||
|
||||
>>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
|
||||
... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
|
||||
... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
|
||||
... ]
|
||||
>>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
|
||||
>>> for sent in sents: # doctest: +SKIP
|
||||
... tokenizer.tokenize(sent) # doctest: +SKIP
|
||||
...
|
||||
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
|
||||
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
|
||||
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
|
||||
|
||||
>>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
|
||||
... print(sent) # doctest: +SKIP
|
||||
...
|
||||
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
|
||||
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
|
||||
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
|
||||
>>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
|
||||
... print(sent) # doctest: +SKIP
|
||||
...
|
||||
[(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
|
||||
[(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
|
||||
[(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
|
||||
"""
|
||||
|
||||
def __init__(self, repp_dir, encoding="utf8"):
|
||||
self.repp_dir = self.find_repptokenizer(repp_dir)
|
||||
# Set a directory to store the temporary files.
|
||||
self.working_dir = tempfile.gettempdir()
|
||||
# Set an encoding for the input strings.
|
||||
self.encoding = encoding
|
||||
|
||||
def tokenize(self, sentence):
|
||||
"""
|
||||
Use Repp to tokenize a single sentence.
|
||||
|
||||
:param sentence: A single sentence string.
|
||||
:type sentence: str
|
||||
:return: A tuple of tokens.
|
||||
:rtype: tuple(str)
|
||||
"""
|
||||
return next(self.tokenize_sents([sentence]))
|
||||
|
||||
def tokenize_sents(self, sentences, keep_token_positions=False):
|
||||
"""
|
||||
Tokenize multiple sentences using Repp.
|
||||
|
||||
:param sentences: A list of sentence strings.
|
||||
:type sentences: list(str)
|
||||
:return: A list of tuples of tokens
|
||||
:rtype: iter(tuple(str))
|
||||
"""
|
||||
with tempfile.NamedTemporaryFile(
|
||||
prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
|
||||
) as input_file:
|
||||
# Write sentences to temporary input file.
|
||||
for sent in sentences:
|
||||
input_file.write(str(sent) + "\n")
|
||||
input_file.close()
|
||||
# Generate command to run REPP.
|
||||
cmd = self.generate_repp_command(input_file.name)
|
||||
# Decode the stdout and strip the trailing newline.
repp_output = self._execute(cmd).decode(self.encoding).strip()
|
||||
for tokenized_sent in self.parse_repp_outputs(repp_output):
|
||||
if not keep_token_positions:
|
||||
# Removes token position information.
|
||||
tokenized_sent, starts, ends = zip(*tokenized_sent)
|
||||
yield tokenized_sent
|
||||
|
||||
def generate_repp_command(self, inputfilename):
|
||||
"""
|
||||
This method generates the REPP command to be used at the terminal.
|
||||
|
||||
:param inputfilename: path to the input file
|
||||
:type inputfilename: str
|
||||
"""
|
||||
cmd = [self.repp_dir + "/src/repp"]
|
||||
cmd += ["-c", self.repp_dir + "/erg/repp.set"]
|
||||
cmd += ["--format", "triple"]
|
||||
cmd += [inputfilename]
|
||||
return cmd
|
||||
|
||||
@staticmethod
|
||||
def _execute(cmd):
|
||||
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
stdout, stderr = p.communicate()
|
||||
return stdout
|
||||
|
||||
@staticmethod
|
||||
def parse_repp_outputs(repp_output):
|
||||
"""
|
||||
This method parses the tri-tuple format that REPP outputs when the
"--format triple" option is used, and returns a generator over the
tokenized sentences.

:param repp_output: the decoded output of the REPP binary
:type repp_output: str
:return: an iterable over the tokenized sentences; each sentence is
    a list of (token, start_offset, end_offset) tuples
:rtype: iter(list(tuple))
"""
|
||||
line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
|
||||
for section in repp_output.split("\n\n"):
|
||||
words_with_positions = [
|
||||
(token, int(start), int(end))
|
||||
for start, end, token in line_regex.findall(section)
|
||||
]
|
||||
yield words_with_positions
|
||||
|
||||
def find_repptokenizer(self, repp_dirname):
|
||||
"""
|
||||
A helper method to find the REPP tokenizer binary and its *repp.set* config file.
|
||||
"""
|
||||
if os.path.exists(repp_dirname): # If a full path is given.
|
||||
_repp_dir = repp_dirname
|
||||
else: # Try to find path to REPP directory in environment variables.
|
||||
_repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
|
||||
# Checks for the REPP binary and erg/repp.set config file.
|
||||
assert os.path.exists(_repp_dir + "/src/repp")
|
||||
assert os.path.exists(_repp_dir + "/erg/repp.set")
|
||||
return _repp_dir
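
if __name__ == "__main__":
    # Illustrative sketch, not part of the upstream module: parse_repp_outputs()
    # is a staticmethod, so REPP's "--format triple" output can be parsed without
    # the REPP binary installed. The sample string below is hand-written to mimic
    # that output format (one "(start, end, token)" triple per line, sentences
    # separated by a blank line).
    sample_output = "(0, 4, Good)\n(5, 12, muffins)\n(13, 17, cost)\n\n(0, 6, Thanks)\n(6, 7, .)"
    for tokens_with_spans in ReppTokenizer.parse_repp_outputs(sample_output):
        print(tokens_with_spans)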
|
||||
140 Backend/venv/lib/python3.12/site-packages/nltk/tokenize/sexpr.py Normal file
@@ -0,0 +1,140 @@
|
||||
# Natural Language Toolkit: Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
|
||||
# Steven Bird <stevenbird1@gmail.com> (minor edits)
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
S-Expression Tokenizer
|
||||
|
||||
``SExprTokenizer`` is used to find parenthesized expressions in a
|
||||
string. In particular, it divides a string into a sequence of
|
||||
substrings that are either parenthesized expressions (including any
|
||||
nested parenthesized expressions), or other whitespace-separated
|
||||
tokens.
|
||||
|
||||
>>> from nltk.tokenize import SExprTokenizer
|
||||
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
|
||||
['(a b (c d))', 'e', 'f', '(g)']
|
||||
|
||||
By default, `SExprTokenizer` will raise a ``ValueError`` exception if
|
||||
used to tokenize an expression with non-matching parentheses:
|
||||
|
||||
>>> SExprTokenizer().tokenize('c) d) e (f (g')
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Un-matched close paren at char 1
|
||||
|
||||
The ``strict`` argument can be set to False to allow for
|
||||
non-matching parentheses. Any unmatched close parentheses will be
|
||||
listed as their own s-expression; and the last partial sexpr with
|
||||
unmatched open parentheses will be listed as its own sexpr:
|
||||
|
||||
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
|
||||
['c', ')', 'd', ')', 'e', '(f (g']
|
||||
|
||||
The characters used for open and close parentheses may be customized
|
||||
using the ``parens`` argument to the `SExprTokenizer` constructor:
|
||||
|
||||
>>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
|
||||
['{a b {c d}}', 'e', 'f', '{g}']
|
||||
|
||||
The s-expression tokenizer is also available as a function:
|
||||
|
||||
>>> from nltk.tokenize import sexpr_tokenize
|
||||
>>> sexpr_tokenize('(a b (c d)) e f (g)')
|
||||
['(a b (c d))', 'e', 'f', '(g)']
|
||||
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
|
||||
class SExprTokenizer(TokenizerI):
|
||||
"""
|
||||
A tokenizer that divides strings into s-expressions.
|
||||
An s-expression can be either:
|
||||
|
||||
- a parenthesized expression, including any nested parenthesized
|
||||
expressions, or
|
||||
- a sequence of non-whitespace non-parenthesis characters.
|
||||
|
||||
For example, the string ``(a (b c)) d e (f)`` consists of four
|
||||
s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.
|
||||
|
||||
By default, the characters ``(`` and ``)`` are treated as open and
|
||||
close parentheses, but alternative strings may be specified.
|
||||
|
||||
:param parens: A two-element sequence specifying the open and close parentheses
|
||||
that should be used to find sexprs. This will typically be either a
|
||||
two-character string, or a list of two strings.
|
||||
:type parens: str or list
|
||||
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
|
||||
"""
|
||||
|
||||
def __init__(self, parens="()", strict=True):
|
||||
if len(parens) != 2:
|
||||
raise ValueError("parens must contain exactly two strings")
|
||||
self._strict = strict
|
||||
self._open_paren = parens[0]
|
||||
self._close_paren = parens[1]
|
||||
self._paren_regexp = re.compile(
|
||||
f"{re.escape(parens[0])}|{re.escape(parens[1])}"
|
||||
)
|
||||
|
||||
def tokenize(self, text):
|
||||
"""
|
||||
Return a list of s-expressions extracted from *text*.
|
||||
For example:
|
||||
|
||||
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
|
||||
['(a b (c d))', 'e', 'f', '(g)']
|
||||
|
||||
All parentheses are assumed to mark s-expressions.
|
||||
(No special processing is done to exclude parentheses that occur
|
||||
inside strings, or following backslash characters.)
|
||||
|
||||
If the given expression contains non-matching parentheses,
|
||||
then the behavior of the tokenizer depends on the ``strict``
|
||||
parameter to the constructor. If ``strict`` is ``True``, then
|
||||
raise a ``ValueError``. If ``strict`` is ``False``, then any
|
||||
unmatched close parentheses will be listed as their own
|
||||
s-expression; and the last partial s-expression with unmatched open
|
||||
parentheses will be listed as its own s-expression:
|
||||
|
||||
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
|
||||
['c', ')', 'd', ')', 'e', '(f (g']
|
||||
|
||||
:param text: the string to be tokenized
|
||||
:type text: str
:rtype: list(str)
|
||||
"""
|
||||
result = []
|
||||
pos = 0
|
||||
depth = 0
|
||||
for m in self._paren_regexp.finditer(text):
|
||||
paren = m.group()
|
||||
if depth == 0:
|
||||
result += text[pos : m.start()].split()
|
||||
pos = m.start()
|
||||
if paren == self._open_paren:
|
||||
depth += 1
|
||||
if paren == self._close_paren:
|
||||
if self._strict and depth == 0:
|
||||
raise ValueError("Un-matched close paren at char %d" % m.start())
|
||||
depth = max(0, depth - 1)
|
||||
if depth == 0:
|
||||
result.append(text[pos : m.end()])
|
||||
pos = m.end()
|
||||
if self._strict and depth > 0:
|
||||
raise ValueError("Un-matched open paren at char %d" % pos)
|
||||
if pos < len(text):
|
||||
result.append(text[pos:])
|
||||
return result
|
||||
|
||||
|
||||
sexpr_tokenize = SExprTokenizer().tokenize
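
if __name__ == "__main__":
    # Illustrative sketch, not part of the upstream module: the depth counter in
    # tokenize() treats whatever bracket pair is configured as the s-expression
    # delimiters, and strict=False keeps stray brackets instead of raising
    # ValueError.
    bracket_tokenizer = SExprTokenizer(parens="[]", strict=False)
    print(bracket_tokenizer.tokenize("[a [b c]] d ] e [f"))
    # expected: ['[a [b c]]', 'd', ']', 'e', '[f']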
|
||||
@@ -0,0 +1,139 @@
|
||||
# Natural Language Toolkit: Simple Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
r"""
|
||||
Simple Tokenizers
|
||||
|
||||
These tokenizers divide strings into substrings using the string
|
||||
``split()`` method.
|
||||
When tokenizing using a particular delimiter string, use
|
||||
the string ``split()`` method directly, as this is more efficient.
|
||||
|
||||
The simple tokenizers are *not* available as separate functions;
|
||||
instead, you should just use the string ``split()`` method directly:
|
||||
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> s.split() # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
|
||||
>>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
|
||||
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
|
||||
>>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good muffins cost $3.88', 'in New York. Please buy me',
|
||||
'two of them.', '', 'Thanks.']
|
||||
|
||||
The simple tokenizers are mainly useful because they follow the
|
||||
standard ``TokenizerI`` interface, and so can be used with any code
|
||||
that expects a tokenizer. For example, these tokenizers can be used
|
||||
to specify the tokenization conventions when building a `CorpusReader`.
|
||||
|
||||
"""
|
||||
|
||||
from nltk.tokenize.api import StringTokenizer, TokenizerI
|
||||
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
|
||||
|
||||
|
||||
class SpaceTokenizer(StringTokenizer):
|
||||
r"""Tokenize a string using the space character as a delimiter,
|
||||
which is the same as ``s.split(' ')``.
|
||||
|
||||
>>> from nltk.tokenize import SpaceTokenizer
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
|
||||
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
|
||||
"""
|
||||
|
||||
_string = " "
|
||||
|
||||
|
||||
class TabTokenizer(StringTokenizer):
|
||||
r"""Tokenize a string use the tab character as a delimiter,
|
||||
the same as ``s.split('\t')``.
|
||||
|
||||
>>> from nltk.tokenize import TabTokenizer
|
||||
>>> TabTokenizer().tokenize('a\tb c\n\t d')
|
||||
['a', 'b c\n', ' d']
|
||||
"""
|
||||
|
||||
_string = "\t"
|
||||
|
||||
|
||||
class CharTokenizer(StringTokenizer):
|
||||
"""Tokenize a string into individual characters. If this functionality
|
||||
is ever required directly, use ``for char in string``.
|
||||
"""
|
||||
|
||||
_string = None
|
||||
|
||||
def tokenize(self, s):
|
||||
return list(s)
|
||||
|
||||
def span_tokenize(self, s):
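        # Each character i is its own span: (0, 1), (1, 2), ..., (len(s) - 1, len(s)).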
|
||||
yield from enumerate(range(1, len(s) + 1))
|
||||
|
||||
|
||||
class LineTokenizer(TokenizerI):
|
||||
r"""Tokenize a string into its lines, optionally discarding blank lines.
|
||||
This is similar to ``s.split('\n')``.
|
||||
|
||||
>>> from nltk.tokenize import LineTokenizer
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good muffins cost $3.88', 'in New York. Please buy me',
|
||||
'two of them.', '', 'Thanks.']
|
||||
>>> # same as [l for l in s.split('\n') if l.strip()]:
|
||||
>>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good muffins cost $3.88', 'in New York. Please buy me',
|
||||
'two of them.', 'Thanks.']
|
||||
|
||||
:param blanklines: Indicates how blank lines should be handled. Valid values are:
|
||||
|
||||
- ``discard``: strip blank lines out of the token list before returning it.
|
||||
A line is considered blank if it contains only whitespace characters.
|
||||
- ``keep``: leave all blank lines in the token list.
|
||||
- ``discard-eof``: if the string ends with a newline, then do not generate
|
||||
a corresponding token ``''`` after that newline.
|
||||
"""
|
||||
|
||||
def __init__(self, blanklines="discard"):
|
||||
valid_blanklines = ("discard", "keep", "discard-eof")
|
||||
if blanklines not in valid_blanklines:
|
||||
raise ValueError(
|
||||
"Blank lines must be one of: %s" % " ".join(valid_blanklines)
|
||||
)
|
||||
|
||||
self._blanklines = blanklines
|
||||
|
||||
def tokenize(self, s):
|
||||
lines = s.splitlines()
|
||||
# If requested, strip off blank lines.
|
||||
if self._blanklines == "discard":
|
||||
lines = [l for l in lines if l.rstrip()]
|
||||
elif self._blanklines == "discard-eof":
|
||||
if lines and not lines[-1].strip():
|
||||
lines.pop()
|
||||
return lines
|
||||
|
||||
# discard-eof not implemented
|
||||
def span_tokenize(self, s):
|
||||
if self._blanklines == "keep":
|
||||
yield from string_span_tokenize(s, r"\n")
|
||||
else:
|
||||
yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
|
||||
|
||||
|
||||
######################################################################
|
||||
# { Tokenization Functions
|
||||
######################################################################
|
||||
# XXX: the module docs state that there are no function versions
|
||||
|
||||
|
||||
def line_tokenize(text, blanklines="discard"):
|
||||
return LineTokenizer(blanklines).tokenize(text)
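
if __name__ == "__main__":
    # Illustrative sketch, not part of the upstream module: 'discard-eof' keeps
    # interior blank lines but drops a trailing whitespace-only line, whereas
    # 'discard' drops every blank line.
    print(line_tokenize("one\n\ntwo\n   ", blanklines="discard-eof"))  # ['one', '', 'two']
    print(line_tokenize("one\n\ntwo\n   ", blanklines="discard"))      # ['one', 'two']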
|
||||
@@ -0,0 +1,194 @@
|
||||
# Natural Language Toolkit: Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Christopher Hench <chris.l.hench@gmail.com>
|
||||
# Alex Estes
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed
|
||||
by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
|
||||
openness of the lips. Syllable breaks occur before troughs in sonority. For more
|
||||
on the SSP see Selkirk (1984).
|
||||
|
||||
The default implementation uses the English alphabet, but the `sonority_hierarchy`
|
||||
can be modified to IPA or any other alphabet for the use-case. The SSP is a
|
||||
universal syllabification algorithm, but that does not mean it performs equally
|
||||
across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
|
||||
if utilizing IPA (pg. 311).
|
||||
|
||||
Importantly, if a custom hierarchy is supplied and vowels span across more than
|
||||
one level, they should be given separately to the `vowels` class attribute.
|
||||
|
||||
References:
|
||||
|
||||
- Otto Jespersen. 1904. Lehrbuch der Phonetik.
|
||||
Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
|
||||
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
|
||||
In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
|
||||
Cambridge, MIT Press. pp. 107-136.
|
||||
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
|
||||
In HLT-NAACL. pp. 308-316.
|
||||
"""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from string import punctuation
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.util import ngrams
|
||||
|
||||
|
||||
class SyllableTokenizer(TokenizerI):
|
||||
"""
|
||||
Syllabifies words based on the Sonority Sequencing Principle (SSP).
|
||||
|
||||
>>> from nltk.tokenize import SyllableTokenizer
|
||||
>>> from nltk import word_tokenize
|
||||
>>> SSP = SyllableTokenizer()
|
||||
>>> SSP.tokenize('justification')
|
||||
['jus', 'ti', 'fi', 'ca', 'tion']
|
||||
>>> text = "This is a foobar-like sentence."
|
||||
>>> [SSP.tokenize(token) for token in word_tokenize(text)]
|
||||
[['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
|
||||
"""
|
||||
|
||||
def __init__(self, lang="en", sonority_hierarchy=False):
|
||||
"""
|
||||
:param lang: Language parameter, default is English, 'en'
|
||||
:type lang: str
|
||||
:param sonority_hierarchy: Sonority hierarchy according to the
|
||||
Sonority Sequencing Principle.
|
||||
:type sonority_hierarchy: list(str)
|
||||
"""
|
||||
# Sonority hierarchy should be provided in descending order.
|
||||
# If vowels are spread across multiple levels, they should all be
# assigned to self.vowels together; otherwise they should be
# placed in the first entry of the hierarchy.
|
||||
if not sonority_hierarchy and lang == "en":
|
||||
sonority_hierarchy = [
|
||||
"aeiouy", # vowels.
|
||||
"lmnrw", # nasals.
|
||||
"zvsf", # fricatives.
|
||||
"bcdgtkpqxhj", # stops.
|
||||
]
|
||||
|
||||
self.vowels = sonority_hierarchy[0]
|
||||
self.phoneme_map = {}
|
||||
for i, level in enumerate(sonority_hierarchy):
|
||||
for c in level:
|
||||
sonority_level = len(sonority_hierarchy) - i
|
||||
self.phoneme_map[c] = sonority_level
|
||||
self.phoneme_map[c.upper()] = sonority_level
|
||||
|
||||
def assign_values(self, token):
|
||||
"""
|
||||
Assigns each phoneme its value from the sonority hierarchy.
|
||||
Note: Sentence/text has to be tokenized first.
|
||||
|
||||
:param token: Single word or token
|
||||
:type token: str
|
||||
:return: List of tuples, first element is character/phoneme and
|
||||
second is the sonority value.
|
||||
:rtype: list(tuple(str, int))
|
||||
"""
|
||||
syllables_values = []
|
||||
for c in token:
|
||||
try:
|
||||
syllables_values.append((c, self.phoneme_map[c]))
|
||||
except KeyError:
|
||||
if c not in "0123456789" and c not in punctuation:
|
||||
warnings.warn(
|
||||
"Character not defined in sonority_hierarchy,"
|
||||
" assigning as vowel: '{}'".format(c)
|
||||
)
|
||||
syllables_values.append((c, max(self.phoneme_map.values())))
|
||||
if c not in self.vowels:
|
||||
self.vowels += c
|
||||
else:  # If it's punctuation or a digit, assign -1.
|
||||
syllables_values.append((c, -1))
|
||||
return syllables_values
|
||||
|
||||
def validate_syllables(self, syllable_list):
|
||||
"""
|
||||
Ensures each syllable has at least one vowel.
|
||||
If the following syllable doesn't have a vowel, add it to the current one.
|
||||
|
||||
:param syllable_list: Single word or token broken up into syllables.
|
||||
:type syllable_list: list(str)
|
||||
:return: Single word or token broken up into syllables
|
||||
(with added syllables if necessary)
|
||||
:rtype: list(str)
|
||||
"""
|
||||
valid_syllables = []
|
||||
front = ""
|
||||
vowel_pattern = re.compile("|".join(self.vowels))
|
||||
for i, syllable in enumerate(syllable_list):
|
||||
if syllable in punctuation:
|
||||
valid_syllables.append(syllable)
|
||||
continue
|
||||
if not vowel_pattern.search(syllable):
|
||||
if len(valid_syllables) == 0:
|
||||
front += syllable
|
||||
else:
|
||||
valid_syllables = valid_syllables[:-1] + [
|
||||
valid_syllables[-1] + syllable
|
||||
]
|
||||
else:
|
||||
if len(valid_syllables) == 0:
|
||||
valid_syllables.append(front + syllable)
|
||||
else:
|
||||
valid_syllables.append(syllable)
|
||||
|
||||
return valid_syllables
|
||||
|
||||
def tokenize(self, token):
|
||||
"""
|
||||
Apply the SSP to return a list of syllables.
|
||||
Note: Sentence/text has to be tokenized first.
|
||||
|
||||
:param token: Single word or token
|
||||
:type token: str
|
||||
:return syllable_list: Single word or token broken up into syllables.
|
||||
:rtype: list(str)
|
||||
"""
|
||||
# assign values from hierarchy
|
||||
syllables_values = self.assign_values(token)
|
||||
|
||||
# if only one vowel return word
|
||||
if sum(token.count(x) for x in self.vowels) <= 1:
|
||||
return [token]
|
||||
|
||||
syllable_list = []
|
||||
syllable = syllables_values[0][0] # start syllable with first phoneme
|
||||
for trigram in ngrams(syllables_values, n=3):
|
||||
phonemes, values = zip(*trigram)
|
||||
# Sonority of previous, focal and following phoneme
|
||||
prev_value, focal_value, next_value = values
|
||||
# Focal phoneme.
|
||||
focal_phoneme = phonemes[1]
|
||||
|
||||
# These cases trigger syllable break.
|
||||
if focal_value == -1: # If it's a punctuation, just break.
|
||||
syllable_list.append(syllable)
|
||||
syllable_list.append(focal_phoneme)
|
||||
syllable = ""
|
||||
elif prev_value >= focal_value == next_value:
|
||||
syllable += focal_phoneme
|
||||
syllable_list.append(syllable)
|
||||
syllable = ""
|
||||
|
||||
elif prev_value > focal_value < next_value:
|
||||
syllable_list.append(syllable)
|
||||
syllable = ""
|
||||
syllable += focal_phoneme
|
||||
|
||||
# no syllable break
|
||||
else:
|
||||
syllable += focal_phoneme
|
||||
|
||||
syllable += syllables_values[-1][0] # append last phoneme
|
||||
syllable_list.append(syllable)
|
||||
|
||||
return self.validate_syllables(syllable_list)
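

if __name__ == "__main__":
    # Illustrative sketch, not part of the upstream module: a custom hierarchy
    # must list the vowels in its first entry, because the tokenizer reads
    # self.vowels from sonority_hierarchy[0]. This example simply spells out
    # the default English hierarchy explicitly.
    ssp = SyllableTokenizer(
        sonority_hierarchy=["aeiouy", "lmnrw", "zvsf", "bcdgtkpqxhj"]
    )
    print(ssp.tokenize("justification"))  # ['jus', 'ti', 'fi', 'ca', 'tion']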
|
||||
@@ -0,0 +1,115 @@
|
||||
# Natural Language Toolkit: Interface to the Stanford Tokenizer
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Xu <xxu@student.unimelb.edu.au>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import warnings
|
||||
from subprocess import PIPE
|
||||
|
||||
from nltk.internals import _java_options, config_java, find_jar, java
|
||||
from nltk.parse.corenlp import CoreNLPParser
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"
|
||||
|
||||
|
||||
class StanfordTokenizer(TokenizerI):
|
||||
r"""
|
||||
Interface to the Stanford Tokenizer
|
||||
|
||||
>>> from nltk.tokenize.stanford import StanfordTokenizer
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
|
||||
>>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
|
||||
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
>>> s = "The colour of the wall is blue."
|
||||
>>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
|
||||
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
|
||||
"""
|
||||
|
||||
_JAR = "stanford-postagger.jar"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_to_jar=None,
|
||||
encoding="utf8",
|
||||
options=None,
|
||||
verbose=False,
|
||||
java_options="-mx1000m",
|
||||
):
|
||||
# Raise deprecation warning.
|
||||
warnings.warn(
|
||||
str(
|
||||
"\nThe StanfordTokenizer will "
|
||||
"be deprecated in version 3.2.5.\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'"
|
||||
),
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
self._stanford_jar = find_jar(
|
||||
self._JAR,
|
||||
path_to_jar,
|
||||
env_vars=("STANFORD_POSTAGGER",),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
)
|
||||
|
||||
self._encoding = encoding
|
||||
self.java_options = java_options
|
||||
|
||||
options = {} if options is None else options
|
||||
self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items())
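        # e.g. options={"americanize": True} yields the string "americanize=True",
        # which _execute() passes to the tokenizer after the "-options" flag.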
|
||||
|
||||
@staticmethod
|
||||
def _parse_tokenized_output(s):
|
||||
return s.splitlines()
|
||||
|
||||
def tokenize(self, s):
|
||||
"""
|
||||
Use the Stanford Tokenizer's PTBTokenizer to tokenize multiple sentences.
|
||||
"""
|
||||
cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
|
||||
return self._parse_tokenized_output(self._execute(cmd, s))
|
||||
|
||||
def _execute(self, cmd, input_, verbose=False):
|
||||
encoding = self._encoding
|
||||
cmd.extend(["-charset", encoding])
|
||||
_options_cmd = self._options_cmd
|
||||
if _options_cmd:
|
||||
cmd.extend(["-options", self._options_cmd])
|
||||
|
||||
default_options = " ".join(_java_options)
|
||||
|
||||
# Configure java.
|
||||
config_java(options=self.java_options, verbose=verbose)
|
||||
|
||||
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
|
||||
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
|
||||
# Write the actual sentences to the temporary input file
|
||||
if isinstance(input_, str) and encoding:
|
||||
input_ = input_.encode(encoding)
|
||||
input_file.write(input_)
|
||||
input_file.flush()
|
||||
|
||||
cmd.append(input_file.name)
|
||||
|
||||
# Run the tagger and get the output.
|
||||
stdout, stderr = java(
|
||||
cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
|
||||
)
|
||||
stdout = stdout.decode(encoding)
|
||||
|
||||
os.unlink(input_file.name)
|
||||
|
||||
# Return java configurations to their default values.
|
||||
config_java(options=default_options, verbose=False)
|
||||
|
||||
return stdout
|
||||
@@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env python
|
||||
# Natural Language Toolkit: Interface to the Stanford Segmenter
|
||||
# for Chinese and Arabic
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: 52nlp <52nlpcn@gmail.com>
|
||||
# Casper Lehmann-Strøm <casperlehmann@gmail.com>
|
||||
# Alex Constantin <alex@keyworder.ch>
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import warnings
|
||||
from subprocess import PIPE
|
||||
|
||||
from nltk.internals import (
|
||||
_java_options,
|
||||
config_java,
|
||||
find_dir,
|
||||
find_file,
|
||||
find_jar,
|
||||
java,
|
||||
)
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
_stanford_url = "https://nlp.stanford.edu/software"
|
||||
|
||||
|
||||
class StanfordSegmenter(TokenizerI):
|
||||
"""Interface to the Stanford Segmenter
|
||||
|
||||
If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
|
||||
should be provided, for example::
|
||||
|
||||
seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
|
||||
|
||||
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
|
||||
>>> seg = StanfordSegmenter() # doctest: +SKIP
|
||||
>>> seg.default_config('zh') # doctest: +SKIP
|
||||
>>> sent = u'这是斯坦福中文分词器测试'
|
||||
>>> print(seg.segment(sent)) # doctest: +SKIP
|
||||
\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
|
||||
<BLANKLINE>
|
||||
>>> seg.default_config('ar') # doctest: +SKIP
|
||||
>>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
|
||||
>>> print(seg.segment(sent.split())) # doctest: +SKIP
|
||||
\u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
|
||||
<BLANKLINE>
|
||||
"""
|
||||
|
||||
_JAR = "stanford-segmenter.jar"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
path_to_jar=None,
|
||||
path_to_slf4j=None,
|
||||
java_class=None,
|
||||
path_to_model=None,
|
||||
path_to_dict=None,
|
||||
path_to_sihan_corpora_dict=None,
|
||||
sihan_post_processing="false",
|
||||
keep_whitespaces="false",
|
||||
encoding="UTF-8",
|
||||
options=None,
|
||||
verbose=False,
|
||||
java_options="-mx2g",
|
||||
):
|
||||
# Raise deprecation warning.
|
||||
warnings.simplefilter("always", DeprecationWarning)
|
||||
warnings.warn(
|
||||
str(
|
||||
"\nThe StanfordTokenizer will "
|
||||
"be deprecated in version 3.2.5.\n"
|
||||
"Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"
|
||||
),
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
|
||||
stanford_segmenter = find_jar(
|
||||
self._JAR,
|
||||
path_to_jar,
|
||||
env_vars=("STANFORD_SEGMENTER",),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
)
|
||||
if path_to_slf4j is not None:
|
||||
slf4j = find_jar(
|
||||
"slf4j-api.jar",
|
||||
path_to_slf4j,
|
||||
env_vars=("SLF4J", "STANFORD_SEGMENTER"),
|
||||
searchpath=(),
|
||||
url=_stanford_url,
|
||||
verbose=verbose,
|
||||
)
|
||||
else:
|
||||
slf4j = None
|
||||
|
||||
# This is passed to java as the -cp option, the old version of segmenter needs slf4j.
|
||||
# The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
|
||||
self._stanford_jar = os.pathsep.join(
|
||||
_ for _ in [stanford_segmenter, slf4j] if _ is not None
|
||||
)
|
||||
|
||||
self._java_class = java_class
|
||||
self._model = path_to_model
|
||||
self._sihan_corpora_dict = path_to_sihan_corpora_dict
|
||||
self._sihan_post_processing = sihan_post_processing
|
||||
self._keep_whitespaces = keep_whitespaces
|
||||
self._dict = path_to_dict
|
||||
|
||||
self._encoding = encoding
|
||||
self.java_options = java_options
|
||||
options = {} if options is None else options
|
||||
self._options_cmd = ",".join(
|
||||
f"{key}={json.dumps(val)}" for key, val in options.items()
|
||||
)
|
||||
|
||||
def default_config(self, lang):
|
||||
"""
|
||||
Attempt to initialize Stanford Word Segmenter for the specified language
|
||||
using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
|
||||
"""
|
||||
|
||||
search_path = ()
|
||||
if os.environ.get("STANFORD_SEGMENTER"):
|
||||
search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}
|
||||
|
||||
# init for Chinese-specific files
|
||||
self._dict = None
|
||||
self._sihan_corpora_dict = None
|
||||
self._sihan_post_processing = "false"
|
||||
|
||||
if lang == "ar":
|
||||
self._java_class = (
|
||||
"edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
|
||||
)
|
||||
model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
|
||||
|
||||
elif lang == "zh":
|
||||
self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
|
||||
model = "pku.gz"
|
||||
self._sihan_post_processing = "true"
|
||||
|
||||
path_to_dict = "dict-chris6.ser.gz"
|
||||
try:
|
||||
self._dict = find_file(
|
||||
path_to_dict,
|
||||
searchpath=search_path,
|
||||
url=_stanford_url,
|
||||
verbose=False,
|
||||
env_vars=("STANFORD_MODELS",),
|
||||
)
|
||||
except LookupError as e:
|
||||
raise LookupError(
|
||||
"Could not find '%s' (tried using env. "
|
||||
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
|
||||
% path_to_dict
|
||||
) from e
|
||||
|
||||
sihan_dir = "./data/"
|
||||
try:
|
||||
path_to_sihan_dir = find_dir(
|
||||
sihan_dir,
|
||||
url=_stanford_url,
|
||||
verbose=False,
|
||||
env_vars=("STANFORD_SEGMENTER",),
|
||||
)
|
||||
self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
|
||||
except LookupError as e:
|
||||
raise LookupError(
|
||||
"Could not find '%s' (tried using the "
|
||||
"STANFORD_SEGMENTER environment variable)" % sihan_dir
|
||||
) from e
|
||||
else:
|
||||
raise LookupError(f"Unsupported language {lang}")
|
||||
|
||||
try:
|
||||
self._model = find_file(
|
||||
model,
|
||||
searchpath=search_path,
|
||||
url=_stanford_url,
|
||||
verbose=False,
|
||||
env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
|
||||
)
|
||||
except LookupError as e:
|
||||
raise LookupError(
|
||||
"Could not find '%s' (tried using env. "
|
||||
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
|
||||
) from e
|
||||
|
||||
def tokenize(self, s):
|
||||
super().tokenize(s)
|
||||
|
||||
def segment_file(self, input_file_path):
|
||||
""" """
|
||||
cmd = [
|
||||
self._java_class,
|
||||
"-loadClassifier",
|
||||
self._model,
|
||||
"-keepAllWhitespaces",
|
||||
self._keep_whitespaces,
|
||||
"-textFile",
|
||||
input_file_path,
|
||||
]
|
||||
if self._sihan_corpora_dict is not None:
|
||||
cmd.extend(
|
||||
[
|
||||
"-serDictionary",
|
||||
self._dict,
|
||||
"-sighanCorporaDict",
|
||||
self._sihan_corpora_dict,
|
||||
"-sighanPostProcessing",
|
||||
self._sihan_post_processing,
|
||||
]
|
||||
)
|
||||
|
||||
stdout = self._execute(cmd)
|
||||
|
||||
return stdout
|
||||
|
||||
def segment(self, tokens):
|
||||
return self.segment_sents([tokens])
|
||||
|
||||
def segment_sents(self, sentences):
|
||||
""" """
|
||||
encoding = self._encoding
|
||||
# Create a temporary input file
|
||||
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
|
||||
|
||||
# Write the actual sentences to the temporary input file
|
||||
_input_fh = os.fdopen(_input_fh, "wb")
|
||||
_input = "\n".join(" ".join(x) for x in sentences)
|
||||
if isinstance(_input, str) and encoding:
|
||||
_input = _input.encode(encoding)
|
||||
_input_fh.write(_input)
|
||||
_input_fh.close()
|
||||
|
||||
cmd = [
|
||||
self._java_class,
|
||||
"-loadClassifier",
|
||||
self._model,
|
||||
"-keepAllWhitespaces",
|
||||
self._keep_whitespaces,
|
||||
"-textFile",
|
||||
self._input_file_path,
|
||||
]
|
||||
if self._sihan_corpora_dict is not None:
|
||||
cmd.extend(
|
||||
[
|
||||
"-serDictionary",
|
||||
self._dict,
|
||||
"-sighanCorporaDict",
|
||||
self._sihan_corpora_dict,
|
||||
"-sighanPostProcessing",
|
||||
self._sihan_post_processing,
|
||||
]
|
||||
)
|
||||
|
||||
stdout = self._execute(cmd)
|
||||
|
||||
# Delete the temporary file
|
||||
os.unlink(self._input_file_path)
|
||||
|
||||
return stdout
|
||||
|
||||
def _execute(self, cmd, verbose=False):
|
||||
encoding = self._encoding
|
||||
cmd.extend(["-inputEncoding", encoding])
|
||||
_options_cmd = self._options_cmd
|
||||
if _options_cmd:
|
||||
cmd.extend(["-options", self._options_cmd])
|
||||
|
||||
default_options = " ".join(_java_options)
|
||||
|
||||
# Configure java.
|
||||
config_java(options=self.java_options, verbose=verbose)
|
||||
|
||||
stdout, _stderr = java(
|
||||
cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
|
||||
)
|
||||
stdout = stdout.decode(encoding)
|
||||
|
||||
# Return java configurations to their default values.
|
||||
config_java(options=default_options, verbose=False)
|
||||
|
||||
return stdout
|
||||
@@ -0,0 +1,474 @@
|
||||
# Natural Language Toolkit: TextTiling
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: George Boutsioukis
|
||||
#
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
import math
|
||||
import re
|
||||
|
||||
try:
|
||||
import numpy
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1
|
||||
LC, HC = 0, 1
|
||||
DEFAULT_SMOOTHING = [0]
|
||||
|
||||
|
||||
class TextTilingTokenizer(TokenizerI):
|
||||
"""Tokenize a document into topical sections using the TextTiling algorithm.
|
||||
This algorithm detects subtopic shifts based on the analysis of lexical
|
||||
co-occurrence patterns.
|
||||
|
||||
The process starts by tokenizing the text into pseudosentences of
|
||||
a fixed size w. Then, depending on the method used, similarity
|
||||
scores are assigned at sentence gaps. The algorithm proceeds by
|
||||
detecting the peak differences between these scores and marking
|
||||
them as boundaries. The boundaries are normalized to the closest
|
||||
paragraph break and the segmented text is returned.
|
||||
|
||||
:param w: Pseudosentence size
|
||||
:type w: int
|
||||
:param k: Size (in sentences) of the block used in the block comparison method
|
||||
:type k: int
|
||||
:param similarity_method: The method used for determining similarity scores:
|
||||
`BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
|
||||
:type similarity_method: constant
|
||||
:param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
|
||||
:type stopwords: list(str)
|
||||
:param smoothing_method: The method used for smoothing the score plot:
|
||||
`DEFAULT_SMOOTHING` (default)
|
||||
:type smoothing_method: constant
|
||||
:param smoothing_width: The width of the window used by the smoothing method
|
||||
:type smoothing_width: int
|
||||
:param smoothing_rounds: The number of smoothing passes
|
||||
:type smoothing_rounds: int
|
||||
:param cutoff_policy: The policy used to determine the number of boundaries:
|
||||
`HC` (default) or `LC`
|
||||
:type cutoff_policy: constant
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> tt = TextTilingTokenizer(demo_mode=True)
|
||||
>>> text = brown.raw()[:4000]
|
||||
>>> s, ss, d, b = tt.tokenize(text)
|
||||
>>> b
|
||||
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
w=20,
|
||||
k=10,
|
||||
similarity_method=BLOCK_COMPARISON,
|
||||
stopwords=None,
|
||||
smoothing_method=DEFAULT_SMOOTHING,
|
||||
smoothing_width=2,
|
||||
smoothing_rounds=1,
|
||||
cutoff_policy=HC,
|
||||
demo_mode=False,
|
||||
):
|
||||
if stopwords is None:
|
||||
from nltk.corpus import stopwords
|
||||
|
||||
stopwords = stopwords.words("english")
|
||||
self.__dict__.update(locals())
|
||||
del self.__dict__["self"]
|
||||
|
||||
def tokenize(self, text):
|
||||
"""Return a tokenized copy of *text*, where each "token" represents
|
||||
a separate topic."""
|
||||
|
||||
lowercase_text = text.lower()
|
||||
paragraph_breaks = self._mark_paragraph_breaks(text)
|
||||
text_length = len(lowercase_text)
|
||||
|
||||
# Tokenization step starts here
|
||||
|
||||
# Remove punctuation
|
||||
nopunct_text = "".join(
|
||||
c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c)
|
||||
)
|
||||
nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
|
||||
|
||||
tokseqs = self._divide_to_tokensequences(nopunct_text)
|
||||
|
||||
# The morphological stemming step mentioned in the TextTile
|
||||
# paper is not implemented. A comment in the original C
|
||||
# implementation states that it offers no benefit to the
|
||||
# process. It might be interesting to test the existing
|
||||
# stemmers though.
|
||||
# words = _stem_words(words)
|
||||
|
||||
# Filter stopwords
|
||||
for ts in tokseqs:
|
||||
ts.wrdindex_list = [
|
||||
wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords
|
||||
]
|
||||
|
||||
token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
|
||||
# End of the Tokenization step
|
||||
|
||||
# Lexical score determination
|
||||
if self.similarity_method == BLOCK_COMPARISON:
|
||||
gap_scores = self._block_comparison(tokseqs, token_table)
|
||||
elif self.similarity_method == VOCABULARY_INTRODUCTION:
|
||||
raise NotImplementedError("Vocabulary introduction not implemented")
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Similarity method {self.similarity_method} not recognized"
|
||||
)
|
||||
|
||||
if self.smoothing_method == DEFAULT_SMOOTHING:
|
||||
smooth_scores = self._smooth_scores(gap_scores)
|
||||
else:
|
||||
raise ValueError(f"Smoothing method {self.smoothing_method} not recognized")
|
||||
# End of Lexical score Determination
|
||||
|
||||
# Boundary identification
|
||||
depth_scores = self._depth_scores(smooth_scores)
|
||||
segment_boundaries = self._identify_boundaries(depth_scores)
|
||||
|
||||
normalized_boundaries = self._normalize_boundaries(
|
||||
text, segment_boundaries, paragraph_breaks
|
||||
)
|
||||
# End of Boundary Identification
|
||||
segmented_text = []
|
||||
prevb = 0
|
||||
|
||||
for b in normalized_boundaries:
|
||||
if b == 0:
|
||||
continue
|
||||
segmented_text.append(text[prevb:b])
|
||||
prevb = b
|
||||
|
||||
if prevb < text_length: # append any text that may be remaining
|
||||
segmented_text.append(text[prevb:])
|
||||
|
||||
if not segmented_text:
|
||||
segmented_text = [text]
|
||||
|
||||
if self.demo_mode:
|
||||
return gap_scores, smooth_scores, depth_scores, segment_boundaries
|
||||
return segmented_text
|
||||
|
||||
def _block_comparison(self, tokseqs, token_table):
|
||||
"""Implements the block comparison method"""
|
||||
|
||||
def blk_frq(tok, block):
|
||||
ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences)
|
||||
freq = sum(tsocc[1] for tsocc in ts_occs)
|
||||
return freq
|
||||
|
||||
gap_scores = []
|
||||
numgaps = len(tokseqs) - 1
|
||||
|
||||
for curr_gap in range(numgaps):
|
||||
score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
|
||||
score = 0.0
|
||||
# adjust window size for boundary conditions
|
||||
if curr_gap < self.k - 1:
|
||||
window_size = curr_gap + 1
|
||||
elif curr_gap > numgaps - self.k:
|
||||
window_size = numgaps - curr_gap
|
||||
else:
|
||||
window_size = self.k
|
||||
|
||||
b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]]
|
||||
b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]]
|
||||
|
||||
for t in token_table:
|
||||
score_dividend += blk_frq(t, b1) * blk_frq(t, b2)
|
||||
score_divisor_b1 += blk_frq(t, b1) ** 2
|
||||
score_divisor_b2 += blk_frq(t, b2) ** 2
|
||||
try:
|
||||
score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2)
|
||||
except ZeroDivisionError:
|
||||
pass # score += 0.0
|
||||
|
||||
gap_scores.append(score)
|
||||
|
||||
return gap_scores
|
||||
|
||||
def _smooth_scores(self, gap_scores):
|
||||
"Wraps the smooth function from the SciPy Cookbook"
|
||||
return list(
|
||||
smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1)
|
||||
)
|
||||
|
||||
def _mark_paragraph_breaks(self, text):
|
||||
"""Identifies indented text or line breaks as the beginning of
|
||||
paragraphs"""
|
||||
MIN_PARAGRAPH = 100
|
||||
pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
|
||||
matches = pattern.finditer(text)
|
||||
|
||||
last_break = 0
|
||||
pbreaks = [0]
|
||||
for pb in matches:
|
||||
if pb.start() - last_break < MIN_PARAGRAPH:
|
||||
continue
|
||||
else:
|
||||
pbreaks.append(pb.start())
|
||||
last_break = pb.start()
|
||||
|
||||
return pbreaks
|
||||
|
||||
def _divide_to_tokensequences(self, text):
|
||||
"Divides the text into pseudosentences of fixed size"
|
||||
w = self.w
|
||||
wrdindex_list = []
|
||||
matches = re.finditer(r"\w+", text)
|
||||
for match in matches:
|
||||
wrdindex_list.append((match.group(), match.start()))
|
||||
return [
|
||||
TokenSequence(i / w, wrdindex_list[i : i + w])
|
||||
for i in range(0, len(wrdindex_list), w)
|
||||
]
|
||||
|
||||
def _create_token_table(self, token_sequences, par_breaks):
|
||||
"Creates a table of TokenTableFields"
|
||||
token_table = {}
|
||||
current_par = 0
|
||||
current_tok_seq = 0
|
||||
pb_iter = par_breaks.__iter__()
|
||||
current_par_break = next(pb_iter)
|
||||
if current_par_break == 0:
|
||||
try:
|
||||
current_par_break = next(pb_iter) # skip break at 0
|
||||
except StopIteration as e:
|
||||
raise ValueError(
|
||||
"No paragraph breaks were found(text too short perhaps?)"
|
||||
) from e
|
||||
for ts in token_sequences:
|
||||
for word, index in ts.wrdindex_list:
|
||||
try:
|
||||
while index > current_par_break:
|
||||
current_par_break = next(pb_iter)
|
||||
current_par += 1
|
||||
except StopIteration:
|
||||
# hit bottom
|
||||
pass
|
||||
|
||||
if word in token_table:
|
||||
token_table[word].total_count += 1
|
||||
|
||||
if token_table[word].last_par != current_par:
|
||||
token_table[word].last_par = current_par
|
||||
token_table[word].par_count += 1
|
||||
|
||||
if token_table[word].last_tok_seq != current_tok_seq:
|
||||
token_table[word].last_tok_seq = current_tok_seq
|
||||
token_table[word].ts_occurences.append([current_tok_seq, 1])
|
||||
else:
|
||||
token_table[word].ts_occurences[-1][1] += 1
|
||||
else: # new word
|
||||
token_table[word] = TokenTableField(
|
||||
first_pos=index,
|
||||
ts_occurences=[[current_tok_seq, 1]],
|
||||
total_count=1,
|
||||
par_count=1,
|
||||
last_par=current_par,
|
||||
last_tok_seq=current_tok_seq,
|
||||
)
|
||||
|
||||
current_tok_seq += 1
|
||||
|
||||
return token_table
|
||||
|
||||
def _identify_boundaries(self, depth_scores):
|
||||
"""Identifies boundaries at the peaks of similarity score
|
||||
differences"""
|
||||
|
||||
boundaries = [0 for x in depth_scores]
|
||||
|
||||
avg = sum(depth_scores) / len(depth_scores)
|
||||
stdev = numpy.std(depth_scores)
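
        # HC uses the higher cutoff (avg - stdev / 2), so fewer gaps qualify as
        # boundaries than with LC (avg - stdev).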
|
||||
|
||||
if self.cutoff_policy == LC:
|
||||
cutoff = avg - stdev
|
||||
else:
|
||||
cutoff = avg - stdev / 2.0
|
||||
|
||||
depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
|
||||
depth_tuples.reverse()
|
||||
hp = list(filter(lambda x: x[0] > cutoff, depth_tuples))
|
||||
|
||||
for dt in hp:
|
||||
boundaries[dt[1]] = 1
|
||||
for dt2 in hp: # undo if there is a boundary close already
|
||||
if (
|
||||
dt[1] != dt2[1]
|
||||
and abs(dt2[1] - dt[1]) < 4
|
||||
and boundaries[dt2[1]] == 1
|
||||
):
|
||||
boundaries[dt[1]] = 0
|
||||
return boundaries
|
||||
|
||||
def _depth_scores(self, scores):
|
||||
"""Calculates the depth of each gap, i.e. the average difference
|
||||
between the left and right peaks and the gap's score"""
|
||||
|
||||
depth_scores = [0 for x in scores]
|
||||
# Clip the boundaries: this follows the rule of thumb (my thumb)
# that a section shouldn't be smaller than at least 2
# pseudosentences for small texts and around 5 for larger ones.
|
||||
|
||||
clip = min(max(len(scores) // 10, 2), 5)
|
||||
index = clip
|
||||
|
||||
for gapscore in scores[clip:-clip]:
|
||||
lpeak = gapscore
|
||||
for score in scores[index::-1]:
|
||||
if score >= lpeak:
|
||||
lpeak = score
|
||||
else:
|
||||
break
|
||||
rpeak = gapscore
|
||||
for score in scores[index:]:
|
||||
if score >= rpeak:
|
||||
rpeak = score
|
||||
else:
|
||||
break
|
||||
depth_scores[index] = lpeak + rpeak - 2 * gapscore
|
||||
index += 1
|
||||
|
||||
return depth_scores
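# Editor's note: a small worked example of the depth formula above,
# lpeak + rpeak - 2 * gapscore (illustrative only, not part of the NLTK
# source). With a left peak of 0.5, a right peak of 0.6 and a gap score
# of 0.1:
#
>>> lpeak, rpeak, gapscore = 0.5, 0.6, 0.1
>>> round(lpeak + rpeak - 2 * gapscore, 2)
0.9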
|
||||
|
||||
def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
|
||||
"""Normalize the boundaries identified to the original text's
|
||||
paragraph breaks"""
|
||||
|
||||
norm_boundaries = []
|
||||
char_count, word_count, gaps_seen = 0, 0, 0
|
||||
seen_word = False
|
||||
|
||||
for char in text:
|
||||
char_count += 1
|
||||
if char in " \t\n" and seen_word:
|
||||
seen_word = False
|
||||
word_count += 1
|
||||
if char not in " \t\n" and not seen_word:
|
||||
seen_word = True
|
||||
if gaps_seen < len(boundaries) and word_count > (
|
||||
max(gaps_seen * self.w, self.w)
|
||||
):
|
||||
if boundaries[gaps_seen] == 1:
|
||||
# find closest paragraph break
|
||||
best_fit = len(text)
|
||||
for br in paragraph_breaks:
|
||||
if best_fit > abs(br - char_count):
|
||||
best_fit = abs(br - char_count)
|
||||
bestbr = br
|
||||
else:
|
||||
break
|
||||
if bestbr not in norm_boundaries: # avoid duplicates
|
||||
norm_boundaries.append(bestbr)
|
||||
gaps_seen += 1
|
||||
|
||||
return norm_boundaries
|
||||
|
||||
|
||||
class TokenTableField:
|
||||
"""A field in the token table holding parameters for each token,
|
||||
used later in the process"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
first_pos,
|
||||
ts_occurences,
|
||||
total_count=1,
|
||||
par_count=1,
|
||||
last_par=0,
|
||||
last_tok_seq=None,
|
||||
):
|
||||
self.__dict__.update(locals())
|
||||
del self.__dict__["self"]
|
||||
|
||||
|
||||
class TokenSequence:
|
||||
"A token list with its original length and its index"
|
||||
|
||||
def __init__(self, index, wrdindex_list, original_length=None):
|
||||
original_length = original_length or len(wrdindex_list)
|
||||
self.__dict__.update(locals())
|
||||
del self.__dict__["self"]
|
||||
|
||||
|
||||
# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth
|
||||
def smooth(x, window_len=11, window="flat"):
|
||||
"""smooth the data using a window with requested size.
|
||||
|
||||
This method is based on the convolution of a scaled window with the signal.
|
||||
The signal is prepared by introducing reflected copies of the signal
|
||||
(with the window size) in both ends so that transient parts are minimized
|
||||
in the beginning and end part of the output signal.
|
||||
|
||||
:param x: the input signal
|
||||
:param window_len: the dimension of the smoothing window; should be an odd integer
|
||||
:param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
|
||||
flat window will produce a moving average smoothing.
|
||||
|
||||
:return: the smoothed signal
|
||||
|
||||
example::
|
||||
|
||||
t=linspace(-2,2,50)
|
||||
x=sin(t)+randn(len(t))*0.1
|
||||
y=smooth(x)
|
||||
|
||||
:see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
|
||||
scipy.signal.lfilter
|
||||
|
||||
TODO: the window parameter could be the window itself, passed as an array instead of a string
|
||||
"""
|
||||
|
||||
if x.ndim != 1:
|
||||
raise ValueError("smooth only accepts 1 dimension arrays.")
|
||||
|
||||
if x.size < window_len:
|
||||
raise ValueError("Input vector needs to be bigger than window size.")
|
||||
|
||||
if window_len < 3:
|
||||
return x
|
||||
|
||||
if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
|
||||
raise ValueError(
|
||||
"Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
|
||||
)
|
||||
|
||||
s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
|
||||
|
||||
# print(len(s))
|
||||
if window == "flat": # moving average
|
||||
w = numpy.ones(window_len, "d")
|
||||
else:
|
||||
w = eval("numpy." + window + "(window_len)")
|
||||
|
||||
y = numpy.convolve(w / w.sum(), s, mode="same")
|
||||
|
||||
return y[window_len - 1 : -window_len + 1]
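# Editor's note: a minimal usage sketch for smooth() (illustrative only, not
# part of the NLTK source; assumes this module's usual import path
# nltk.tokenize.texttiling). Because of the reflected padding and the final
# slice, the output has the same length as the input:
#
>>> import numpy
>>> from nltk.tokenize.texttiling import smooth
>>> noisy = numpy.sin(numpy.linspace(-2, 2, 50)) + numpy.random.randn(50) * 0.1
>>> smoothed = smooth(noisy, window_len=11, window="flat")
>>> len(smoothed) == len(noisy)
True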
|
||||
|
||||
|
||||
def demo(text=None):
|
||||
from matplotlib import pylab
|
||||
|
||||
from nltk.corpus import brown
|
||||
|
||||
tt = TextTilingTokenizer(demo_mode=True)
|
||||
if text is None:
|
||||
text = brown.raw()[:10000]
|
||||
s, ss, d, b = tt.tokenize(text)
|
||||
pylab.xlabel("Sentence Gap index")
|
||||
pylab.ylabel("Gap Scores")
|
||||
pylab.plot(range(len(s)), s, label="Gap Scores")
|
||||
pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
|
||||
pylab.plot(range(len(d)), d, label="Depth scores")
|
||||
pylab.stem(range(len(b)), b)
|
||||
pylab.legend()
|
||||
pylab.show()
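# Editor's note: a plot-free variant of the demo above (illustrative only,
# not part of the NLTK source; requires the Brown corpus to be downloaded).
# With the default settings, tokenize() returns the text split into tiles:
#
>>> from nltk.corpus import brown
>>> from nltk.tokenize import TextTilingTokenizer
>>> tt = TextTilingTokenizer()
>>> tiles = tt.tokenize(brown.raw()[:10000])
>>> isinstance(tiles, list)
True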
|
||||
@@ -0,0 +1,180 @@
|
||||
# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
|
||||
#
|
||||
# Copyright (C) 2001-2015 NLTK Project
|
||||
# Author: Jon Dehdari
|
||||
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters,
|
||||
# Alex Rudnick
|
||||
#
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
|
||||
sentence per line; thus only the final period is tokenized.
|
||||
|
||||
Tok-tok has been tested on, and gives reasonably good results for English,
|
||||
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
|
||||
The input should be in UTF-8 encoding.
|
||||
|
||||
Reference:
|
||||
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
|
||||
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
|
||||
|
||||
class ToktokTokenizer(TokenizerI):
|
||||
"""
|
||||
This is a Python port of the tok-tok.pl from
|
||||
https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
|
||||
|
||||
>>> toktok = ToktokTokenizer()
|
||||
>>> text = u'Is 9.5 or 525,600 my favorite number?'
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
Is 9.5 or 525,600 my favorite number ?
|
||||
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
|
||||
>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
|
||||
>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
|
||||
>>> assert toktok.tokenize(text, return_str=True) == expected
|
||||
>>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
|
||||
True
|
||||
"""
|
||||
|
||||
# Replace non-breaking spaces with normal spaces.
|
||||
NON_BREAKING = re.compile("\u00A0"), " "
|
||||
|
||||
# Pad some funky punctuation.
|
||||
FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
|
||||
# Pad more funky punctuation.
|
||||
FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 "
|
||||
# Pad En dash and em dash
|
||||
EN_EM_DASHES = re.compile("([–—])"), r" \1 "
|
||||
|
||||
# Replace problematic characters with numeric character references.
|
||||
AMPERCENT = re.compile("& "), "& "
|
||||
TAB = re.compile("\t"), " 	 "
|
||||
PIPE = re.compile(r"\|"), " | "
|
||||
|
||||
# Pad commas, but keep commas inside numbers from being split off.
|
||||
COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "
|
||||
|
||||
# Just pad problematic (often neurotic) hyphen/single quote, etc.
|
||||
PROB_SINGLE_QUOTES = re.compile(r"(['’`])"), r" \1 "
|
||||
# Group ` ` stupid quotes ' ' into a single token.
|
||||
STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
|
||||
STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
|
||||
|
||||
# Don't tokenize the period unless it ends the line and isn't
|
||||
# preceded by another period, e.g.
|
||||
# "something ..." -> "something ..."
|
||||
# "something." -> "something ."
|
||||
FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
|
||||
# Don't tokenize the period unless it ends the line, e.g.
|
||||
# " ... stuff." -> "... stuff ."
|
||||
FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
|
||||
|
||||
# Treat continuous commas as fake German, Czech, etc.: „
|
||||
MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
|
||||
# Treat continuous dashes as fake en-dash, etc.
|
||||
MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
|
||||
# Treat multiple periods as a single token (e.g. an ellipsis)
|
||||
MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "
|
||||
|
||||
# This is the \p{Open_Punctuation} from Perl's perluniprops
|
||||
# see https://perldoc.perl.org/perluniprops.html
|
||||
OPEN_PUNCT = str(
|
||||
"([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
|
||||
"\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
|
||||
"\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
|
||||
"\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
|
||||
"\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
|
||||
"\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
|
||||
"\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
|
||||
"\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
|
||||
"\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
|
||||
)
|
||||
# This is the \p{Close_Punctuation} from Perl's perluniprops
|
||||
CLOSE_PUNCT = str(
|
||||
")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
|
||||
"\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
|
||||
"\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
|
||||
"\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
|
||||
"\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
|
||||
"\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
|
||||
"\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
|
||||
"\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
|
||||
"\uff09\uff3d\uff5d\uff60\uff63"
|
||||
)
|
||||
# This is the \p{Currency_Symbol} from Perl's perluniprops
|
||||
CURRENCY_SYM = str(
|
||||
"$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
|
||||
"\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
|
||||
"\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
|
||||
"\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
|
||||
"\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
|
||||
"\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
|
||||
)
|
||||
|
||||
# Pad spaces after opening punctuations.
|
||||
OPEN_PUNCT_RE = re.compile(f"([{OPEN_PUNCT}])"), r"\1 "
|
||||
# Pad spaces before closing punctuations.
|
||||
CLOSE_PUNCT_RE = re.compile(f"([{CLOSE_PUNCT}])"), r"\1 "
|
||||
# Pad spaces after currency symbols.
|
||||
CURRENCY_SYM_RE = re.compile(f"([{CURRENCY_SYM}])"), r"\1 "
|
||||
|
||||
# Use for tokenizing URL-unfriendly characters: [:/?#]
|
||||
URL_FOE_1 = re.compile(r":(?!//)"), r" : " # in perl s{:(?!//)}{ : }g;
|
||||
URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? " # in perl s{\?(?!\S)}{ ? }g;
|
||||
# in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
|
||||
URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
|
||||
URL_FOE_4 = re.compile(r" /"), r" / " # s{ /}{ / }g;
|
||||
|
||||
# Left/Right strip, i.e. remove leading/trailing spaces.
|
||||
# These strip regexes should NOT be used,
|
||||
# instead use str.lstrip(), str.rstrip() or str.strip()
|
||||
# (They are kept for reference purposes to the original toktok.pl code)
|
||||
LSTRIP = re.compile(r"^ +"), ""
|
||||
RSTRIP = re.compile(r"\s+$"), "\n"
|
||||
# Merge multiple spaces.
|
||||
ONE_SPACE = re.compile(r" {2,}"), " "
|
||||
|
||||
TOKTOK_REGEXES = [
|
||||
NON_BREAKING,
|
||||
FUNKY_PUNCT_1,
|
||||
FUNKY_PUNCT_2,
|
||||
URL_FOE_1,
|
||||
URL_FOE_2,
|
||||
URL_FOE_3,
|
||||
URL_FOE_4,
|
||||
AMPERCENT,
|
||||
TAB,
|
||||
PIPE,
|
||||
OPEN_PUNCT_RE,
|
||||
CLOSE_PUNCT_RE,
|
||||
MULTI_COMMAS,
|
||||
COMMA_IN_NUM,
|
||||
PROB_SINGLE_QUOTES,
|
||||
STUPID_QUOTES_1,
|
||||
STUPID_QUOTES_2,
|
||||
CURRENCY_SYM_RE,
|
||||
EN_EM_DASHES,
|
||||
MULTI_DASHES,
|
||||
MULTI_DOTS,
|
||||
FINAL_PERIOD_1,
|
||||
FINAL_PERIOD_2,
|
||||
ONE_SPACE,
|
||||
]
|
||||
|
||||
def tokenize(self, text, return_str=False):
|
||||
text = str(text)  # Ensure the input is a str.
|
||||
for regexp, substitution in self.TOKTOK_REGEXES:
|
||||
text = regexp.sub(substitution, text)
|
||||
# Finally, strip leading and trailing spaces
|
||||
# and ensure the output is a str.
|
||||
text = str(text.strip())
|
||||
return text if return_str else text.split()
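# Editor's note: one more worked example of the regex cascade above
# (illustrative only, not part of the NLTK source). The comma inside the
# number survives, while the currency symbol and punctuation are split off:
#
>>> from nltk.tokenize.toktok import ToktokTokenizer
>>> ToktokTokenizer().tokenize("A price of $5,000 (roughly).")
['A', 'price', 'of', '$', '5,000', '(', 'roughly', ')', '.']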
|
||||
@@ -0,0 +1,402 @@
|
||||
# Natural Language Toolkit: Tokenizers
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
|
||||
# Tom Aarsen <> (modifications)
|
||||
#
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
r"""
|
||||
|
||||
Penn Treebank Tokenizer
|
||||
|
||||
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
|
||||
This implementation is a port of the tokenizer sed script written by Robert McIntyre
|
||||
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
|
||||
"""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from typing import Iterator, List, Tuple
|
||||
|
||||
from nltk.tokenize.api import TokenizerI
|
||||
from nltk.tokenize.destructive import MacIntyreContractions
|
||||
from nltk.tokenize.util import align_tokens
|
||||
|
||||
|
||||
class TreebankWordTokenizer(TokenizerI):
|
||||
r"""
|
||||
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
|
||||
|
||||
This tokenizer performs the following steps:
|
||||
|
||||
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
|
||||
- treat most punctuation characters as separate tokens
|
||||
- split off commas and single quotes, when followed by whitespace
|
||||
- separate periods that appear at the end of line
|
||||
|
||||
>>> from nltk.tokenize import TreebankWordTokenizer
|
||||
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
|
||||
>>> TreebankWordTokenizer().tokenize(s)
|
||||
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
|
||||
>>> s = "They'll save and invest more."
|
||||
>>> TreebankWordTokenizer().tokenize(s)
|
||||
['They', "'ll", 'save', 'and', 'invest', 'more', '.']
|
||||
>>> s = "hi, my name can't hello,"
|
||||
>>> TreebankWordTokenizer().tokenize(s)
|
||||
['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
|
||||
"""
|
||||
|
||||
# starting quotes
|
||||
STARTING_QUOTES = [
|
||||
(re.compile(r"^\""), r"``"),
|
||||
(re.compile(r"(``)"), r" \1 "),
|
||||
(re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
|
||||
]
|
||||
|
||||
# punctuation
|
||||
PUNCTUATION = [
|
||||
(re.compile(r"([:,])([^\d])"), r" \1 \2"),
|
||||
(re.compile(r"([:,])$"), r" \1 "),
|
||||
(re.compile(r"\.\.\."), r" ... "),
|
||||
(re.compile(r"[;@#$%&]"), r" \g<0> "),
|
||||
(
|
||||
re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
|
||||
r"\1 \2\3 ",
|
||||
), # Handles the final period.
|
||||
(re.compile(r"[?!]"), r" \g<0> "),
|
||||
(re.compile(r"([^'])' "), r"\1 ' "),
|
||||
]
|
||||
|
||||
# Pads parentheses
|
||||
PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
|
||||
|
||||
# Optionally: Convert parentheses, brackets and converts them to PTB symbols.
|
||||
CONVERT_PARENTHESES = [
|
||||
(re.compile(r"\("), "-LRB-"),
|
||||
(re.compile(r"\)"), "-RRB-"),
|
||||
(re.compile(r"\["), "-LSB-"),
|
||||
(re.compile(r"\]"), "-RSB-"),
|
||||
(re.compile(r"\{"), "-LCB-"),
|
||||
(re.compile(r"\}"), "-RCB-"),
|
||||
]
|
||||
|
||||
DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
|
||||
|
||||
# ending quotes
|
||||
ENDING_QUOTES = [
|
||||
(re.compile(r"''"), " '' "),
|
||||
(re.compile(r'"'), " '' "),
|
||||
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
|
||||
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
|
||||
]
|
||||
|
||||
# List of contractions adapted from Robert MacIntyre's tokenizer.
|
||||
_contractions = MacIntyreContractions()
|
||||
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
|
||||
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
|
||||
|
||||
def tokenize(
|
||||
self, text: str, convert_parentheses: bool = False, return_str: bool = False
|
||||
) -> List[str]:
|
||||
r"""Return a tokenized copy of `text`.
|
||||
|
||||
>>> from nltk.tokenize import TreebankWordTokenizer
|
||||
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
|
||||
>>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
|
||||
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
|
||||
'of', 'them.', 'Thanks', '.']
|
||||
>>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
|
||||
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
|
||||
'of', 'them.', 'Thanks', '.']
|
||||
|
||||
:param text: A string with a sentence or sentences.
|
||||
:type text: str
|
||||
:param convert_parentheses: if True, replace parentheses to PTB symbols,
|
||||
e.g. `(` to `-LRB-`. Defaults to False.
|
||||
:type convert_parentheses: bool, optional
|
||||
:param return_str: If True, return tokens as space-separated string,
|
||||
defaults to False.
|
||||
:type return_str: bool, optional
|
||||
:return: List of tokens from `text`.
|
||||
:rtype: List[str]
|
||||
"""
|
||||
if return_str is not False:
|
||||
warnings.warn(
|
||||
"Parameter 'return_str' has been deprecated and should no "
|
||||
"longer be used.",
|
||||
category=DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
for regexp, substitution in self.STARTING_QUOTES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
for regexp, substitution in self.PUNCTUATION:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Handles parentheses.
|
||||
regexp, substitution = self.PARENS_BRACKETS
|
||||
text = regexp.sub(substitution, text)
|
||||
# Optionally convert parentheses
|
||||
if convert_parentheses:
|
||||
for regexp, substitution in self.CONVERT_PARENTHESES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Handles double dash.
|
||||
regexp, substitution = self.DOUBLE_DASHES
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# add extra space to make things easier
|
||||
text = " " + text + " "
|
||||
|
||||
for regexp, substitution in self.ENDING_QUOTES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
for regexp in self.CONTRACTIONS2:
|
||||
text = regexp.sub(r" \1 \2 ", text)
|
||||
for regexp in self.CONTRACTIONS3:
|
||||
text = regexp.sub(r" \1 \2 ", text)
|
||||
|
||||
# We are not using CONTRACTIONS4 since
|
||||
# they are also commented out in the SED scripts
|
||||
# for regexp in self._contractions.CONTRACTIONS4:
|
||||
# text = regexp.sub(r' \1 \2 \3 ', text)
|
||||
|
||||
return text.split()
|
||||
|
||||
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
|
||||
r"""
|
||||
Returns the spans of the tokens in ``text``.
|
||||
Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.
|
||||
|
||||
>>> from nltk.tokenize import TreebankWordTokenizer
|
||||
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
|
||||
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
|
||||
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
|
||||
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
|
||||
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
|
||||
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
|
||||
True
|
||||
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
|
||||
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
|
||||
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
|
||||
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
|
||||
True
|
||||
|
||||
:param text: A string with a sentence or sentences.
|
||||
:type text: str
|
||||
:yield: Tuple[int, int]
|
||||
"""
|
||||
raw_tokens = self.tokenize(text)
|
||||
|
||||
# Convert converted quotes back to original double quotes
|
||||
# Do this only if original text contains double quote(s) or double
|
||||
# single-quotes (because '' might be transformed to `` if it is
|
||||
# treated as starting quotes).
|
||||
if ('"' in text) or ("''" in text):
|
||||
# Find double quotes and converted quotes
|
||||
matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
|
||||
|
||||
# Replace converted quotes back to double quotes
|
||||
tokens = [
|
||||
matched.pop(0) if tok in ['"', "``", "''"] else tok
|
||||
for tok in raw_tokens
|
||||
]
|
||||
else:
|
||||
tokens = raw_tokens
|
||||
|
||||
yield from align_tokens(tokens, text)
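# Editor's note: a short illustration of why the quote restoration above is
# needed (illustrative only, not part of the NLTK source). tokenize() rewrites
# double quotes as `` and '', while the spans still index into the original
# string:
#
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = 'He said, "hello".'
>>> TreebankWordTokenizer().tokenize(s)
['He', 'said', ',', '``', 'hello', "''", '.']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)]
['He', 'said', ',', '"', 'hello', '"', '.']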
|
||||
|
||||
|
||||
class TreebankWordDetokenizer(TokenizerI):
|
||||
r"""
|
||||
The Treebank detokenizer uses the reverse regex operations corresponding to
|
||||
the Treebank tokenizer's regexes.
|
||||
|
||||
Note:
|
||||
|
||||
- There are additional assumptions made when undoing the padding of ``[;@#$%&]``
|
||||
punctuation symbols that aren't presupposed by the TreebankWordTokenizer.
|
||||
- There are additional regexes added to reverse the parentheses tokenization,
|
||||
such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
|
||||
padding added to the closing parentheses preceding ``[:;,.]``.
|
||||
- It's not possible to return the original whitespace as it was because
|
||||
there is no explicit record of where `'\n'`, `'\t'` or `'\s'` was removed by
|
||||
the text.split() operation.
|
||||
|
||||
>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
|
||||
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
|
||||
>>> d = TreebankWordDetokenizer()
|
||||
>>> t = TreebankWordTokenizer()
|
||||
>>> toks = t.tokenize(s)
|
||||
>>> d.detokenize(toks)
|
||||
'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'
|
||||
|
||||
The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
|
||||
parameter:
|
||||
|
||||
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
|
||||
>>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
|
||||
... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
|
||||
... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
|
||||
>>> expected_tokens == t.tokenize(s, convert_parentheses=True)
|
||||
True
|
||||
>>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
|
||||
>>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
|
||||
True
|
||||
|
||||
During tokenization it's safe to add more spaces but during detokenization,
|
||||
simply undoing the padding doesn't really help.
|
||||
|
||||
- During tokenization, left and right padding is added to ``[!?]``; when
|
||||
detokenizing, only a left shift of the ``[!?]`` is needed.
|
||||
Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.
|
||||
|
||||
- During tokenization ``[:,]`` are left and right padded but when detokenizing,
|
||||
only left shift is necessary and we keep right pad after comma/colon
|
||||
if the string after is a non-digit.
|
||||
Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.
|
||||
|
||||
>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
|
||||
>>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
|
||||
>>> twd = TreebankWordDetokenizer()
|
||||
>>> twd.detokenize(toks)
|
||||
"hello, i can't feel my feet! Help!!"
|
||||
|
||||
>>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
|
||||
... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
|
||||
>>> twd.detokenize(toks)
|
||||
"hello, i can't feel; my feet! Help!! He said: Help, help?!"
|
||||
"""
|
||||
|
||||
_contractions = MacIntyreContractions()
|
||||
CONTRACTIONS2 = [
|
||||
re.compile(pattern.replace("(?#X)", r"\s"))
|
||||
for pattern in _contractions.CONTRACTIONS2
|
||||
]
|
||||
CONTRACTIONS3 = [
|
||||
re.compile(pattern.replace("(?#X)", r"\s"))
|
||||
for pattern in _contractions.CONTRACTIONS3
|
||||
]
|
||||
|
||||
# ending quotes
|
||||
ENDING_QUOTES = [
|
||||
(re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
|
||||
(re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
|
||||
(re.compile(r"(\S)\s(\'\')"), r"\1\2"),
|
||||
(
|
||||
re.compile(r"(\'\')\s([.,:)\]>};%])"),
|
||||
r"\1\2",
|
||||
), # Quotes followed by no-left-padded punctuations.
|
||||
(re.compile(r"''"), '"'),
|
||||
]
|
||||
|
||||
# Handles double dashes
|
||||
DOUBLE_DASHES = (re.compile(r" -- "), r"--")
|
||||
|
||||
# Optionally: Convert parentheses, brackets and converts them from PTB symbols.
|
||||
CONVERT_PARENTHESES = [
|
||||
(re.compile("-LRB-"), "("),
|
||||
(re.compile("-RRB-"), ")"),
|
||||
(re.compile("-LSB-"), "["),
|
||||
(re.compile("-RSB-"), "]"),
|
||||
(re.compile("-LCB-"), "{"),
|
||||
(re.compile("-RCB-"), "}"),
|
||||
]
|
||||
|
||||
# Undo padding on parentheses.
|
||||
PARENS_BRACKETS = [
|
||||
(re.compile(r"([\[\(\{\<])\s"), r"\g<1>"),
|
||||
(re.compile(r"\s([\]\)\}\>])"), r"\g<1>"),
|
||||
(re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
|
||||
]
|
||||
|
||||
# punctuation
|
||||
PUNCTUATION = [
|
||||
(re.compile(r"([^'])\s'\s"), r"\1' "),
|
||||
(re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!]
|
||||
# (re.compile(r'\s([?!])\s'), r'\g<1>'),
|
||||
(re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
|
||||
# When tokenizing, [;@#$%&] are padded with whitespace regardless of
|
||||
# whether there are spaces before or after them.
|
||||
# But during detokenization, we need to distinguish between left/right
|
||||
# pad, so we split this up.
|
||||
(re.compile(r"([#$])\s"), r"\g<1>"), # Left pad.
|
||||
(re.compile(r"\s([;%])"), r"\g<1>"), # Right pad.
|
||||
# (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad.
|
||||
(re.compile(r"\s\.\.\.\s"), r"..."),
|
||||
# (re.compile(r"\s([:,])\s$"), r"\1"), # .strip() takes care of it.
|
||||
(
|
||||
re.compile(r"\s([:,])"),
|
||||
r"\1",
|
||||
), # Just remove left padding. Punctuation in numbers won't be padded.
|
||||
]
|
||||
|
||||
# starting quotes
|
||||
STARTING_QUOTES = [
|
||||
(re.compile(r"([ (\[{<])\s``"), r"\1``"),
|
||||
(re.compile(r"(``)\s"), r"\1"),
|
||||
(re.compile(r"``"), r'"'),
|
||||
]
|
||||
|
||||
def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
|
||||
"""
|
||||
Treebank detokenizer, created by undoing the regexes from
|
||||
the TreebankWordTokenizer.tokenize.
|
||||
|
||||
:param tokens: A list of strings, i.e. tokenized text.
|
||||
:type tokens: List[str]
|
||||
:param convert_parentheses: if True, replace PTB symbols with parentheses,
|
||||
e.g. `-LRB-` to `(`. Defaults to False.
|
||||
:type convert_parentheses: bool, optional
|
||||
:return: str
|
||||
"""
|
||||
text = " ".join(tokens)
|
||||
|
||||
# Add extra space to make things easier
|
||||
text = " " + text + " "
|
||||
|
||||
# Reverse the contractions regexes.
|
||||
# Note: CONTRACTIONS4 are not used in tokenization.
|
||||
for regexp in self.CONTRACTIONS3:
|
||||
text = regexp.sub(r"\1\2", text)
|
||||
for regexp in self.CONTRACTIONS2:
|
||||
text = regexp.sub(r"\1\2", text)
|
||||
|
||||
# Reverse the regexes applied for ending quotes.
|
||||
for regexp, substitution in self.ENDING_QUOTES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Undo the space padding.
|
||||
text = text.strip()
|
||||
|
||||
# Reverse the padding on double dashes.
|
||||
regexp, substitution = self.DOUBLE_DASHES
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
if convert_parentheses:
|
||||
for regexp, substitution in self.CONVERT_PARENTHESES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Reverse the padding regexes applied for parenthesis/brackets.
|
||||
for regexp, substitution in self.PARENS_BRACKETS:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Reverse the regexes applied for punctuations.
|
||||
for regexp, substitution in self.PUNCTUATION:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
# Reverse the regexes applied for starting quotes.
|
||||
for regexp, substitution in self.STARTING_QUOTES:
|
||||
text = regexp.sub(substitution, text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
|
||||
"""Duck-typing the abstract *tokenize()*."""
|
||||
return self.tokenize(tokens, convert_parentheses)
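# Editor's note: the quote handling also round-trips; a minimal sketch
# (illustrative only, not part of the NLTK source):
#
>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
>>> toks = TreebankWordTokenizer().tokenize('He said, "hello".')
>>> TreebankWordDetokenizer().detokenize(toks)
'He said, "hello".'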
|
||||
295
Backend/venv/lib/python3.12/site-packages/nltk/tokenize/util.py
Normal file
@@ -0,0 +1,295 @@
|
||||
# Natural Language Toolkit: Tokenizer Utilities
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Steven Bird <stevenbird1@gmail.com>
|
||||
# URL: <https://www.nltk.org>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
from re import finditer
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
|
||||
def string_span_tokenize(s, sep):
|
||||
r"""
|
||||
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
|
||||
tuples, by splitting the string at each occurrence of *sep*.
|
||||
|
||||
>>> from nltk.tokenize.util import string_span_tokenize
|
||||
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
|
||||
... two of them.\n\nThanks.'''
|
||||
>>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
|
||||
[(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
|
||||
(38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]
|
||||
|
||||
:param s: the string to be tokenized
|
||||
:type s: str
|
||||
:param sep: the token separator
|
||||
:type sep: str
|
||||
:rtype: iter(tuple(int, int))
|
||||
"""
|
||||
if len(sep) == 0:
|
||||
raise ValueError("Token delimiter must not be empty")
|
||||
left = 0
|
||||
while True:
|
||||
try:
|
||||
right = s.index(sep, left)
|
||||
if right != 0:
|
||||
yield left, right
|
||||
except ValueError:
|
||||
if left != len(s):
|
||||
yield left, len(s)
|
||||
break
|
||||
|
||||
left = right + len(sep)
|
||||
|
||||
|
||||
def regexp_span_tokenize(s, regexp):
|
||||
r"""
|
||||
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
|
||||
tuples, by splitting the string at each successive match of *regexp*.
|
||||
|
||||
>>> from nltk.tokenize.util import regexp_span_tokenize
|
||||
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
|
||||
... two of them.\n\nThanks.'''
|
||||
>>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
|
||||
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
|
||||
(38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
|
||||
|
||||
:param s: the string to be tokenized
|
||||
:type s: str
|
||||
:param regexp: regular expression that matches token separators (must not be empty)
|
||||
:type regexp: str
|
||||
:rtype: iter(tuple(int, int))
|
||||
"""
|
||||
left = 0
|
||||
for m in finditer(regexp, s):
|
||||
right, next = m.span()
|
||||
if right != left:
|
||||
yield left, right
|
||||
left = next
|
||||
yield left, len(s)
|
||||
|
||||
|
||||
def spans_to_relative(spans):
|
||||
r"""
|
||||
Return a sequence of relative spans, given a sequence of spans.
|
||||
|
||||
>>> from nltk.tokenize import WhitespaceTokenizer
|
||||
>>> from nltk.tokenize.util import spans_to_relative
|
||||
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
|
||||
... two of them.\n\nThanks.'''
|
||||
>>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
|
||||
[(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
|
||||
(1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]
|
||||
|
||||
:param spans: a sequence of (start, end) offsets of the tokens
|
||||
:type spans: iter(tuple(int, int))
|
||||
:rtype: iter(tuple(int, int))
|
||||
"""
|
||||
prev = 0
|
||||
for left, right in spans:
|
||||
yield left - prev, right - left
|
||||
prev = right
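# Editor's note: the inverse mapping is just a running sum; relative_to_spans
# below is a hypothetical helper for illustration, not part of the NLTK source:
#
>>> def relative_to_spans(rel_spans):
...     pos = 0
...     for gap, length in rel_spans:
...         start = pos + gap
...         yield start, start + length
...         pos = start + length
>>> list(relative_to_spans([(0, 4), (1, 7), (1, 4)]))
[(0, 4), (5, 12), (13, 17)]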
|
||||
|
||||
|
||||
class CJKChars:
|
||||
"""
|
||||
An object that enumerates the code points of the CJK characters as listed on
|
||||
https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
|
||||
|
||||
This is a Python port of the CJK code point enumerations of Moses tokenizer:
|
||||
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
|
||||
"""
|
||||
|
||||
# Hangul Jamo (1100–11FF)
|
||||
Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff"))
|
||||
|
||||
# CJK Radicals Supplement (2E80–2EFF)
|
||||
# Kangxi Radicals (2F00–2FDF)
|
||||
# Ideographic Description Characters (2FF0–2FFF)
|
||||
# CJK Symbols and Punctuation (3000–303F)
|
||||
# Hiragana (3040–309F)
|
||||
# Katakana (30A0–30FF)
|
||||
# Bopomofo (3100–312F)
|
||||
# Hangul Compatibility Jamo (3130–318F)
|
||||
# Kanbun (3190–319F)
|
||||
# Bopomofo Extended (31A0–31BF)
|
||||
# CJK Strokes (31C0–31EF)
|
||||
# Katakana Phonetic Extensions (31F0–31FF)
|
||||
# Enclosed CJK Letters and Months (3200–32FF)
|
||||
# CJK Compatibility (3300–33FF)
|
||||
# CJK Unified Ideographs Extension A (3400–4DBF)
|
||||
# Yijing Hexagram Symbols (4DC0–4DFF)
|
||||
# CJK Unified Ideographs (4E00–9FFF)
|
||||
# Yi Syllables (A000–A48F)
|
||||
# Yi Radicals (A490–A4CF)
|
||||
CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf"))
|
||||
|
||||
# Phags-pa (A840–A87F)
|
||||
Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f"))
|
||||
|
||||
# Hangul Syllables (AC00–D7AF)
|
||||
Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF"))
|
||||
|
||||
# CJK Compatibility Ideographs (F900–FAFF)
|
||||
CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF"))
|
||||
|
||||
# CJK Compatibility Forms (FE30–FE4F)
|
||||
CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F"))
|
||||
|
||||
# Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
|
||||
Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC"))
|
||||
|
||||
# Supplementary Ideographic Plane 20000–2FFFF
|
||||
Supplementary_Ideographic_Plane = (
|
||||
131072,
|
||||
196607,
|
||||
) # (ord(u"\U00020000"), ord(u"\U0002FFFF"))
|
||||
|
||||
ranges = [
|
||||
Hangul_Jamo,
|
||||
CJK_Radicals,
|
||||
Phags_Pa,
|
||||
Hangul_Syllables,
|
||||
CJK_Compatibility_Ideographs,
|
||||
CJK_Compatibility_Forms,
|
||||
Katakana_Hangul_Halfwidth,
|
||||
Supplementary_Ideographic_Plane,
|
||||
]
|
||||
|
||||
|
||||
def is_cjk(character):
|
||||
"""
|
||||
Python port of Moses' code to check for CJK character.
|
||||
|
||||
>>> CJKChars().ranges
|
||||
[(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
|
||||
>>> is_cjk(u'\u33fe')
|
||||
True
|
||||
>>> is_cjk(u'\uFE5F')
|
||||
False
|
||||
|
||||
:param character: The character that needs to be checked.
|
||||
:type character: char
|
||||
:return: bool
|
||||
"""
|
||||
return any(
|
||||
[
|
||||
start <= ord(character) <= end
|
||||
for start, end in [
|
||||
(4352, 4607),
|
||||
(11904, 42191),
|
||||
(43072, 43135),
|
||||
(44032, 55215),
|
||||
(63744, 64255),
|
||||
(65072, 65103),
|
||||
(65381, 65500),
|
||||
(131072, 196607),
|
||||
]
|
||||
]
|
||||
)
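# Editor's note: the check is per character, so a whole string is usually
# tested with any(); a minimal sketch (illustrative only, not part of the
# NLTK source):
#
>>> from nltk.tokenize.util import is_cjk
>>> any(is_cjk(ch) for ch in "NLTK 自然言語処理")
True
>>> any(is_cjk(ch) for ch in "NLTK tokenizers")
False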
|
||||
|
||||
|
||||
def xml_escape(text):
|
||||
"""
|
||||
This function transforms the input text into an "escaped" version suitable
|
||||
for well-formed XML formatting.
|
||||
|
||||
Note that the default xml.sax.saxutils.escape() function doesn't escape
|
||||
some characters that Moses does, so we have to add them manually to the
|
||||
entities dictionary.
|
||||
|
||||
>>> input_str = ''')| & < > ' " ] ['''
|
||||
>>> expected_output = ''')| & < > ' " ] ['''
|
||||
>>> escape(input_str) == expected_output
|
||||
True
|
||||
>>> xml_escape(input_str)
|
||||
')| & < > ' " ] ['
|
||||
|
||||
:param text: The text that needs to be escaped.
|
||||
:type text: str
|
||||
:rtype: str
|
||||
"""
|
||||
return escape(
|
||||
text,
|
||||
entities={
|
||||
r"'": r"'",
|
||||
r'"': r""",
|
||||
r"|": r"|",
|
||||
r"[": r"[",
|
||||
r"]": r"]",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def xml_unescape(text):
|
||||
"""
|
||||
This function transforms the "escaped" version suitable
|
||||
for well-formed XML formatting into a human-readable string.
|
||||
|
||||
Note that the default xml.sax.saxutils.unescape() function doesn't unescape
|
||||
some characters that Moses does, so we have to add them manually to the
|
||||
entities dictionary.
|
||||
|
||||
>>> from xml.sax.saxutils import unescape
|
||||
>>> s = ')| & < > ' " ] ['
|
||||
>>> expected = ''')| & < > \' " ] ['''
|
||||
>>> xml_unescape(s) == expected
|
||||
True
|
||||
|
||||
:param text: The text that needs to be unescaped.
|
||||
:type text: str
|
||||
:rtype: str
|
||||
"""
|
||||
return unescape(
|
||||
text,
|
||||
entities={
|
||||
r"'": r"'",
|
||||
r""": r'"',
|
||||
r"|": r"|",
|
||||
r"[": r"[",
|
||||
r"]": r"]",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def align_tokens(tokens, sentence):
|
||||
"""
|
||||
This function attempts to find the offsets of the tokens in *sentence*, as a sequence
|
||||
of ``(start, end)`` tuples, given the tokens and also the source string.
|
||||
|
||||
>>> from nltk.tokenize import TreebankWordTokenizer
|
||||
>>> from nltk.tokenize.util import align_tokens
|
||||
>>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
|
||||
... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
|
||||
... "on Saturday.")
|
||||
>>> tokens = TreebankWordTokenizer().tokenize(s)
|
||||
>>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
|
||||
... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
|
||||
... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
|
||||
... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
|
||||
... (123, 131), (131, 132)]
|
||||
>>> output = list(align_tokens(tokens, s))
|
||||
>>> len(tokens) == len(expected) == len(output) # Check that length of tokens and tuples are the same.
|
||||
True
|
||||
>>> expected == list(align_tokens(tokens, s)) # Check that the output is as expected.
|
||||
True
|
||||
>>> tokens == [s[start:end] for start, end in output] # Check that the slices of the string corresponds to the tokens.
|
||||
True
|
||||
|
||||
:param tokens: The list of strings that are the result of tokenization
|
||||
:type tokens: list(str)
|
||||
:param sentence: The original string
|
||||
:type sentence: str
|
||||
:rtype: list(tuple(int,int))
|
||||
"""
|
||||
point = 0
|
||||
offsets = []
|
||||
for token in tokens:
|
||||
try:
|
||||
start = sentence.index(token, point)
|
||||
except ValueError as e:
|
||||
raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
|
||||
point = start + len(token)
|
||||
offsets.append((start, point))
|
||||
return offsets
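# Editor's note: align_tokens only requires that each token occur verbatim in
# the sentence, so extra whitespace in the original string is fine; a minimal
# sketch (illustrative only, not part of the NLTK source):
#
>>> from nltk.tokenize.util import align_tokens
>>> align_tokens(['Good', 'muffins'], 'Good  muffins')
[(0, 4), (6, 13)]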
|
||||