Iliyan Angelov
2025-12-01 06:50:10 +02:00
parent 91f51bc6fe
commit 62c1fe5951
4682 changed files with 544807 additions and 31208 deletions

View File

@@ -0,0 +1,145 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com> (minor additions)
# Contributors: matthewmc, clouds56
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
r"""
NLTK Tokenizer Package
Tokenizers divide strings into lists of substrings. For example,
tokenizers can be used to find the words and punctuation in a string:
>>> from nltk.tokenize import word_tokenize
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
... two of them.\n\nThanks.'''
>>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:
>>> from nltk.tokenize import wordpunct_tokenize
>>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:
>>> from nltk.tokenize import sent_tokenize, word_tokenize
>>> sent_tokenize(s)
['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
>>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
[['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]
Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers. (These methods are implemented as generators.)
>>> from nltk.tokenize import WhitespaceTokenizer
>>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
(45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
There are numerous ways to tokenize text. If you need more control over
tokenization, see the other methods provided in this package.
For further information, please see Chapter 3 of the NLTK book.
"""
import functools
import re
from nltk.data import load
from nltk.tokenize.casual import TweetTokenizer, casual_tokenize
from nltk.tokenize.destructive import NLTKWordTokenizer
from nltk.tokenize.legality_principle import LegalitySyllableTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTokenizer
from nltk.tokenize.regexp import (
BlanklineTokenizer,
RegexpTokenizer,
WhitespaceTokenizer,
WordPunctTokenizer,
blankline_tokenize,
regexp_tokenize,
wordpunct_tokenize,
)
from nltk.tokenize.repp import ReppTokenizer
from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize
from nltk.tokenize.simple import (
LineTokenizer,
SpaceTokenizer,
TabTokenizer,
line_tokenize,
)
from nltk.tokenize.sonority_sequencing import SyllableTokenizer
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.texttiling import TextTilingTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
@functools.lru_cache
def _get_punkt_tokenizer(language="english"):
"""
A constructor for the PunktTokenizer that utilizes
an LRU cache for performance.
:param language: the model name in the Punkt corpus
:type language: str
"""
return PunktTokenizer(language)
# Standard sentence tokenizer.
def sent_tokenize(text, language="english"):
"""
Return a sentence-tokenized copy of *text*,
using NLTK's recommended sentence tokenizer
(currently :class:`.PunktSentenceTokenizer`
for the specified language).
:param text: text to split into sentences
:param language: the model name in the Punkt corpus
"""
tokenizer = _get_punkt_tokenizer(language)
return tokenizer.tokenize(text)
# Standard word tokenizer.
_treebank_word_tokenizer = NLTKWordTokenizer()
def word_tokenize(text, language="english", preserve_line=False):
"""
Return a tokenized copy of *text*,
using NLTK's recommended word tokenizer
(currently an improved :class:`.TreebankWordTokenizer`
along with :class:`.PunktSentenceTokenizer`
for the specified language).
:param text: text to split into words
:type text: str
:param language: the model name in the Punkt corpus
:type language: str
:param preserve_line: A flag to decide whether to sentence tokenize the text or not; if True, *text* is treated as a single sentence.
:type preserve_line: bool
"""
sentences = [text] if preserve_line else sent_tokenize(text, language)
return [
token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
]
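# A minimal usage sketch (runs only when this module is executed directly).
# It assumes the Punkt sentence models for English are already installed;
# the demo string is illustrative only.
if __name__ == "__main__":
    demo = "Good muffins cost $3.88 in New York. Please buy me two of them."
    # Default behaviour: split into sentences with Punkt, then word-tokenize
    # each sentence with the improved Treebank tokenizer.
    print(word_tokenize(demo))
    # preserve_line=True skips sentence splitting and treats the whole input
    # as a single sentence.
    print(word_tokenize(demo, preserve_line=True))
    # The sentence tokenizer can also be used on its own.
    print(sent_tokenize(demo))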

View File

@@ -0,0 +1,83 @@
# Natural Language Toolkit: Tokenizer Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Tokenizer Interface
"""
from abc import ABC, abstractmethod
from typing import Iterator, List, Tuple
from nltk.internals import overridden
from nltk.tokenize.util import string_span_tokenize
class TokenizerI(ABC):
"""
A processing interface for tokenizing a string.
Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both).
"""
@abstractmethod
def tokenize(self, s: str) -> List[str]:
"""
Return a tokenized copy of *s*.
:rtype: List[str]
"""
if overridden(self.tokenize_sents):
return self.tokenize_sents([s])[0]
def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]:
"""
Identify the tokens using integer offsets ``(start_i, end_i)``,
where ``s[start_i:end_i]`` is the corresponding token.
:rtype: Iterator[Tuple[int, int]]
"""
raise NotImplementedError()
def tokenize_sents(self, strings: List[str]) -> List[List[str]]:
"""
Apply ``self.tokenize()`` to each element of ``strings``. I.e.:
return [self.tokenize(s) for s in strings]
:rtype: List[List[str]]
"""
return [self.tokenize(s) for s in strings]
def span_tokenize_sents(
self, strings: List[str]
) -> Iterator[List[Tuple[int, int]]]:
"""
Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.:
return [self.span_tokenize(s) for s in strings]
:yield: List[Tuple[int, int]]
"""
for s in strings:
yield list(self.span_tokenize(s))
class StringTokenizer(TokenizerI):
"""A tokenizer that divides a string into substrings by splitting
on the specified string (defined in subclasses).
"""
@property
@abstractmethod
def _string(self):
raise NotImplementedError
def tokenize(self, s):
return s.split(self._string)
def span_tokenize(self, s):
yield from string_span_tokenize(s, self._string)
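# A minimal sketch of how the interface is meant to be subclassed (runs only
# when this module is executed directly). ``CommaTokenizer`` is a hypothetical
# example class, not part of NLTK.
if __name__ == "__main__":

    class CommaTokenizer(StringTokenizer):
        """Splits a string on commas by providing the ``_string`` constant."""

        _string = ","

    demo = "a,b,,c"
    tokenizer = CommaTokenizer()
    # tokenize() is inherited from StringTokenizer and uses str.split().
    print(tokenizer.tokenize(demo))
    # span_tokenize() yields (start, end) offsets with string-slice semantics.
    print(list(tokenizer.span_tokenize(demo)))
    # tokenize_sents() applies tokenize() to each string in a list.
    print(tokenizer.tokenize_sents([demo, "x,y"]))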

View File

@@ -0,0 +1,458 @@
#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
# Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
# Pierpaolo Pantone <> (modifications)
# Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:
1. The tuple REGEXPS defines a list of regular expression
strings.
2. The REGEXPS strings are put, in order, into a compiled
regular expression object called WORD_RE, under the TweetTokenizer
class.
3. The tokenization is done by WORD_RE.findall(s), where s is the
user-supplied string, inside the tokenize() method of the class
TweetTokenizer.
4. When instantiating Tokenizer objects, there are several options:
* preserve_case. By default, it is set to True. If it is set to
False, then the tokenizer will downcase everything except for
emoticons.
* reduce_len. By default, it is set to False. It specifies whether
to replace repeated character sequences of length 3 or greater
with sequences of length 3.
* strip_handles. By default, it is set to False. It specifies
whether to remove Twitter handles from the text passed to the
`tokenize` method.
* match_phone_numbers. By default, it is set to True. It indicates
whether the `tokenize` method should look for phone numbers.
"""
######################################################################
import html
from typing import List
import regex # https://github.com/nltk/nltk/issues/2409
from nltk.tokenize.api import TokenizerI
######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags come after emoticons, due to the
# possibility of having text like
#
# <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.
# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?
# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
|
</?3 # heart
)"""
# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
URLS = r""" # Capture 1: entire matched URL
(?:
https?: # URL protocol and colon
(?:
/{1,3} # 1-3 slashes
| # or
[a-z0-9%] # Single letter or digit or '%'
# (Trying not to match e.g. "URI::Escape")
)
| # or
# looks like domain name followed by a slash:
[a-z0-9.\-]+[.]
(?:[a-z]{2,13})
/
)
(?: # One or more:
[^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[]
| # or
\([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
|
\([^\s]+?\) # balanced parens, non-recursive: (...)
)+
(?: # End with:
\([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
|
\([^\s]+?\) # balanced parens, non-recursive: (...)
| # or
[^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars
)
| # OR, the following to match naked domains:
(?:
(?<!@) # not preceded by a @, avoid matching foo@_gmail.com_
[a-z0-9]+
(?:[.\-][a-z0-9]+)*
[.]
(?:[a-z]{2,13})
\b
/?
(?!@) # not succeeded by a @,
# avoid matching "foo.na" in "foo.na@example.com"
)
"""
# emoji flag sequence
# https://en.wikipedia.org/wiki/Regional_indicator_symbol
# For regex simplicity, include all possible enclosed letter pairs,
# not the ISO subset of two-letter regional indicator symbols.
# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
# Future regional flag support may be handled with the regex for
# U+1F3F4 🏴 followed by emoji tag sequences:
# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
FLAGS = r"""
(?:
[\U0001F1E6-\U0001F1FF]{2} # all enclosed letter pairs
|
# English flag
\U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
|
# Scottish flag
\U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
|
# For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
\U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
)
"""
# Regex for recognizing phone numbers:
PHONE_REGEX = r"""
(?:
(?: # (international)
\+?[01]
[ *\-.\)]*
)?
(?: # (area code)
[\(]?
\d{3}
[ *\-.\)]*
)?
\d{3} # exchange
[ *\-.\)]*
\d{4} # base
)"""
# The components of the tokenizer:
REGEXPS = (
URLS,
# ASCII Emoticons
EMOTICONS,
# HTML tags:
r"""<[^>\s]+>""",
# ASCII Arrows
r"""[\-]+>|<[\-]+""",
# Twitter username:
r"""(?:@[\w_]+)""",
# Twitter hashtags:
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
# email addresses
r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
# Zero-Width-Joiner and Skin tone modifier emojis
""".(?:
[\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
|
[\U0001F3FB-\U0001F3FF]
)""",
# flags
FLAGS,
# Remaining word types:
r"""
(?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
|
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
""",
)
# Take the main components and add a phone regex as the second parameter
REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])
######################################################################
# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
# the core tokenizing regexes. They are compiled lazily.
# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")
# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)
# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
# For stripping away handles from a tweet:
HANDLES_RE = regex.compile(
r"(?<![A-Za-z0-9_!@#\$%&*])@"
r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
)
######################################################################
# Functions for converting html entities
######################################################################
def _str_to_unicode(text, encoding=None, errors="strict"):
if encoding is None:
encoding = "utf-8"
if isinstance(text, bytes):
return text.decode(encoding, errors)
return text
def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
"""
Remove entities from text by converting them to their
corresponding unicode character.
:param text: a unicode string or a byte string encoded in the given
`encoding` (which defaults to 'utf-8').
:param list keep: list of entity names which should not be replaced.\
This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
and named entities (such as ``&nbsp;`` or ``&gt;``).
:param bool remove_illegal: If `True`, entities that can't be converted are\
removed. Otherwise, entities that can't be converted are kept "as
is".
:returns: A unicode string with the entities removed.
See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py
>>> from nltk.tokenize.casual import _replace_html_entities
>>> _replace_html_entities(b'Price: &pound;100')
'Price: \\xa3100'
>>> print(_replace_html_entities(b'Price: &pound;100'))
Price: £100
>>>
"""
def _convert_entity(match):
entity_body = match.group(3)
if match.group(1):
try:
if match.group(2):
number = int(entity_body, 16)
else:
number = int(entity_body, 10)
# Numeric character references in the 80-9F range are typically
# interpreted by browsers as representing the characters mapped
# to bytes 80-9F in the Windows-1252 encoding. For more info
# see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
if 0x80 <= number <= 0x9F:
return bytes((number,)).decode("cp1252")
except ValueError:
number = None
else:
if entity_body in keep:
return match.group(0)
number = html.entities.name2codepoint.get(entity_body)
if number is not None:
try:
return chr(number)
except (ValueError, OverflowError):
pass
return "" if remove_illegal else match.group(0)
return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))
######################################################################
class TweetTokenizer(TokenizerI):
r"""
Tokenizer for tweets.
>>> from nltk.tokenize import TweetTokenizer
>>> tknzr = TweetTokenizer()
>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
>>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
'<--']
Examples using the `strip_handles` and `reduce_len` parameters:
>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
>>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
>>> tknzr.tokenize(s1)
[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
"""
# Values used to lazily compile WORD_RE and PHONE_WORD_RE,
# which are the core tokenizing regexes.
_WORD_RE = None
_PHONE_WORD_RE = None
######################################################################
def __init__(
self,
preserve_case=True,
reduce_len=False,
strip_handles=False,
match_phone_numbers=True,
):
"""
Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.
:param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
of text used in the `tokenize` method. Defaults to True.
:type preserve_case: bool
:param reduce_len: Flag indicating whether to replace repeated character sequences
of length 3 or greater with sequences of length 3. Defaults to False.
:type reduce_len: bool
:param strip_handles: Flag indicating whether to remove Twitter handles from the text
passed to the `tokenize` method. Defaults to False.
:type strip_handles: bool
:param match_phone_numbers: Flag indicating whether the `tokenize` method should look
for phone numbers. Defaults to True.
:type match_phone_numbers: bool
"""
self.preserve_case = preserve_case
self.reduce_len = reduce_len
self.strip_handles = strip_handles
self.match_phone_numbers = match_phone_numbers
def tokenize(self, text: str) -> List[str]:
"""Tokenize the input text.
:param text: str
:rtype: list(str)
:return: a tokenized list of strings; joining this list returns\
the original string if `preserve_case=False`.
"""
# Fix HTML character entities:
text = _replace_html_entities(text)
# Remove username handles
if self.strip_handles:
text = remove_handles(text)
# Normalize word lengthening
if self.reduce_len:
text = reduce_lengthening(text)
# Shorten problematic sequences of characters
safe_text = HANG_RE.sub(r"\1\1\1", text)
# Recognise phone numbers during tokenization
if self.match_phone_numbers:
words = self.PHONE_WORD_RE.findall(safe_text)
else:
words = self.WORD_RE.findall(safe_text)
# Possibly alter the case, but avoid changing emoticons like :D into :d:
if not self.preserve_case:
words = list(
map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
)
return words
@property
def WORD_RE(self) -> "regex.Pattern":
"""Core TweetTokenizer regex"""
# Compiles the regex for this and all future instantiations of TweetTokenizer.
if not type(self)._WORD_RE:
type(self)._WORD_RE = regex.compile(
f"({'|'.join(REGEXPS)})",
regex.VERBOSE | regex.I | regex.UNICODE,
)
return type(self)._WORD_RE
@property
def PHONE_WORD_RE(self) -> "regex.Pattern":
"""Secondary core TweetTokenizer regex"""
# Compiles the regex for this and all future instantiations of TweetTokenizer.
if not type(self)._PHONE_WORD_RE:
type(self)._PHONE_WORD_RE = regex.compile(
f"({'|'.join(REGEXPS_PHONE)})",
regex.VERBOSE | regex.I | regex.UNICODE,
)
return type(self)._PHONE_WORD_RE
######################################################################
# Normalization Functions
######################################################################
def reduce_lengthening(text):
"""
Replace repeated character sequences of length 3 or greater with sequences
of length 3.
"""
pattern = regex.compile(r"(.)\1{2,}")
return pattern.sub(r"\1\1\1", text)
def remove_handles(text):
"""
Remove Twitter username handles from text.
"""
# Substitute handles with ' ' to ensure that text on either side of removed handles is tokenized correctly
return HANDLES_RE.sub(" ", text)
######################################################################
# Tokenization Function
######################################################################
def casual_tokenize(
text,
preserve_case=True,
reduce_len=False,
strip_handles=False,
match_phone_numbers=True,
):
"""
Convenience function for wrapping the tokenizer.
"""
return TweetTokenizer(
preserve_case=preserve_case,
reduce_len=reduce_len,
strip_handles=strip_handles,
match_phone_numbers=match_phone_numbers,
).tokenize(text)
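# A minimal usage sketch (runs only when this module is executed directly)
# exercising the options described in the module docstring; the tweet text
# is illustrative only.
if __name__ == "__main__":
    tweet = "@someone that was loooooove!! :-D call 555-123-4567 or visit https://example.com"
    # Default settings: case preserved, handles kept, phone numbers matched.
    print(TweetTokenizer().tokenize(tweet))
    # Strip the handle, squash runs of repeated characters to length 3 and
    # lowercase everything except emoticons.
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    print(tknzr.tokenize(tweet))
    # The convenience wrapper builds an equivalent tokenizer per call.
    print(casual_tokenize(tweet, match_phone_numbers=False))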
###############################################################################

View File

@@ -0,0 +1,234 @@
# Natural Language Toolkit: NLTK's very own tokenizer.
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Liling Tan
# Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
import re
import warnings
from typing import Iterator, List, Tuple
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens
class MacIntyreContractions:
"""
List of contractions adapted from Robert MacIntyre's tokenizer.
"""
CONTRACTIONS2 = [
r"(?i)\b(can)(?#X)(not)\b",
r"(?i)\b(d)(?#X)('ye)\b",
r"(?i)\b(gim)(?#X)(me)\b",
r"(?i)\b(gon)(?#X)(na)\b",
r"(?i)\b(got)(?#X)(ta)\b",
r"(?i)\b(lem)(?#X)(me)\b",
r"(?i)\b(more)(?#X)('n)\b",
r"(?i)\b(wan)(?#X)(na)(?=\s)",
]
CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
class NLTKWordTokenizer(TokenizerI):
"""
The NLTK tokenizer that has improved upon the TreebankWordTokenizer.
This is the tokenizer that is invoked by ``word_tokenize()``. It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.
The tokenizer is "destructive" in that the regexes applied may alter the
input string beyond reconstruction. It is possible to apply
`TreebankWordDetokenizer.detokenize` to the tokenized outputs of
`NLTKWordTokenizer.tokenize`, but there is no guarantee that this will
recover the original string.
"""
# Starting quotes.
STARTING_QUOTES = [
(re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
(re.compile(r"^\""), r"``"),
(re.compile(r"(``)"), r" \1 "),
(re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
(re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
]
# Ending quotes.
ENDING_QUOTES = [
(re.compile("([»”’])", re.U), r" \1 "),
(re.compile(r"''"), " '' "),
(re.compile(r'"'), " '' "),
(re.compile(r"\s+"), " "),
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
]
# For improvements for starting/closing quotes from TreebankWordTokenizer,
# see discussion on https://github.com/nltk/nltk/pull/1437
# Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on
# - chevron quotes u'\xab' and u'\xbb'
# - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
# See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608
# Also, behavior of splitting on clitics now follows Stanford CoreNLP
# - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b
# Punctuation.
PUNCTUATION = [
(re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
(re.compile(r"([:,])([^\d])"), r" \1 \2"),
(re.compile(r"([:,])$"), r" \1 "),
(
re.compile(r"\.{2,}", re.U),
r" \g<0> ",
), # See https://github.com/nltk/nltk/pull/2322
(re.compile(r"[;@#$%&]"), r" \g<0> "),
(
re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
r"\1 \2\3 ",
), # Handles the final period.
(re.compile(r"[?!]"), r" \g<0> "),
(re.compile(r"([^'])' "), r"\1 ' "),
(
re.compile(r"[*]", re.U),
r" \g<0> ",
), # See https://github.com/nltk/nltk/pull/2322
]
# Pads parentheses
PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
# Optionally convert parentheses and brackets to PTB symbols.
CONVERT_PARENTHESES = [
(re.compile(r"\("), "-LRB-"),
(re.compile(r"\)"), "-RRB-"),
(re.compile(r"\["), "-LSB-"),
(re.compile(r"\]"), "-RSB-"),
(re.compile(r"\{"), "-LCB-"),
(re.compile(r"\}"), "-RCB-"),
]
DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
# List of contractions adapted from Robert MacIntyre's tokenizer.
_contractions = MacIntyreContractions()
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, replace parentheses to PTB symbols,
e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
"longer be used.",
category=DeprecationWarning,
stacklevel=2,
)
for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)
for regexp, substitution in self.PUNCTUATION:
text = regexp.sub(substitution, text)
# Handles parentheses.
regexp, substitution = self.PARENS_BRACKETS
text = regexp.sub(substitution, text)
# Optionally convert parentheses
if convert_parentheses:
for regexp, substitution in self.CONVERT_PARENTHESES:
text = regexp.sub(substitution, text)
# Handles double dash.
regexp, substitution = self.DOUBLE_DASHES
text = regexp.sub(substitution, text)
# add extra space to make things easier
text = " " + text + " "
for regexp, substitution in self.ENDING_QUOTES:
text = regexp.sub(substitution, text)
for regexp in self.CONTRACTIONS2:
text = regexp.sub(r" \1 \2 ", text)
for regexp in self.CONTRACTIONS3:
text = regexp.sub(r" \1 \2 ", text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self._contractions.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.
>>> from nltk.tokenize import NLTKWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
True
:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)
# Convert converted quotes back to original double quotes
# Do this only if original text contains double quote(s) or double
# single-quotes (because '' might be transformed to `` if it is
# treated as starting quotes).
if ('"' in text) or ("''" in text):
# Find double quotes and converted quotes
matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
# Replace converted quotes back to double quotes
tokens = [
matched.pop(0) if tok in ['"', "``", "''"] else tok
for tok in raw_tokens
]
else:
tokens = raw_tokens
yield from align_tokens(tokens, text)
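# A minimal sketch of the "destructive" behaviour described in the class
# docstring (runs only when this module is executed directly): detokenizing
# the output is best-effort and may not reproduce the original string.
if __name__ == "__main__":
    from nltk.tokenize.treebank import TreebankWordDetokenizer

    s = 'The "destructive" tokenizer can\'t always be reversed -- compare below.'
    tokens = NLTKWordTokenizer().tokenize(s)
    print(tokens)
    # Best-effort round trip; compare against the original string.
    print(TreebankWordDetokenizer().detokenize(tokens))
    # Spans always index into the original, unmodified string.
    print([(start, end, s[start:end]) for start, end in NLTKWordTokenizer().span_tokenize(s)])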

View File

@@ -0,0 +1,147 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
# Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
The Legality Principle is a language-agnostic principle maintaining that syllable
onsets and codas (the beginnings and ends of syllables, not including the vowel)
are only legal if they are found as word onsets or codas in the language. The English
word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found
word-initially in the English language (Bartlett et al.). This principle was first proposed
in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''.
Kahn further argues that there is a ''strong tendency to syllabify in such a way that
initial clusters are of maximal length, consistent with the general constraints on
word-initial consonant clusters.'' Consequently, in addition to being legal onsets,
the longest legal onset is preferable---''Onset Maximization''.
The default implementation assumes an English vowel set, but the `vowels` attribute
can be set to IPA or any other alphabet's vowel set for the use-case.
Both a valid set of vowels as well as a text corpus of words in the language
are necessary to determine legal onsets and subsequently syllabify words.
The legality principle with onset maximization is a universal syllabification algorithm,
but that does not mean it performs equally across languages. Bartlett et al. (2009)
is a good benchmark for English accuracy if utilizing IPA (pg. 311).
References:
- Otto Jespersen. 1904. Lehrbuch der Phonetik.
Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11.
- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976).
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
Cambridge, MIT Press. pp. 107-136.
- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409-436.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
In HLT-NAACL. pp. 308-316.
- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley.
"""
from collections import Counter
from nltk.tokenize.api import TokenizerI
class LegalitySyllableTokenizer(TokenizerI):
"""
Syllabifies words based on the Legality Principle and Onset Maximization.
>>> from nltk.tokenize import LegalitySyllableTokenizer
>>> from nltk import word_tokenize
>>> from nltk.corpus import words
>>> text = "This is a wonderful sentence."
>>> text_words = word_tokenize(text)
>>> LP = LegalitySyllableTokenizer(words.words())
>>> [LP.tokenize(word) for word in text_words]
[['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']]
"""
def __init__(
self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001
):
"""
:param tokenized_source_text: List of valid tokens in the language
:type tokenized_source_text: list(str)
:param vowels: Valid vowels in language or IPA representation
:type vowels: str
:param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset
:type legal_frequency_threshold: float
"""
self.legal_frequency_threshold = legal_frequency_threshold
self.vowels = vowels
self.legal_onsets = self.find_legal_onsets(tokenized_source_text)
def find_legal_onsets(self, words):
"""
Gathers all onsets and then returns only those above the frequency threshold
:param words: List of words in a language
:type words: list(str)
:return: Set of legal onsets
:rtype: set(str)
"""
onsets = [self.onset(word) for word in words]
legal_onsets = [
k
for k, v in Counter(onsets).items()
if (v / len(onsets)) > self.legal_frequency_threshold
]
return set(legal_onsets)
def onset(self, word):
"""
Returns the consonant cluster of a word, i.e. all characters until the first vowel.
:param word: Single word or token
:type word: str
:return: String of characters of onset
:rtype: str
"""
onset = ""
for c in word.lower():
if c in self.vowels:
return onset
else:
onset += c
return onset
def tokenize(self, token):
"""
Apply the Legality Principle in combination with
Onset Maximization to return a list of syllables.
:param token: Single word or token
:type token: str
:return syllable_list: Single word or token broken up into syllables.
:rtype: list(str)
"""
syllables = []
syllable, current_onset = "", ""
vowel, onset = False, False
for char in token[::-1]:
char_lower = char.lower()
if not vowel:
syllable += char
vowel = bool(char_lower in self.vowels)
else:
if char_lower + current_onset[::-1] in self.legal_onsets:
syllable += char
current_onset += char_lower
onset = True
elif char_lower in self.vowels and not onset:
syllable += char
current_onset += char_lower
else:
syllables.append(syllable)
syllable = char
current_onset = ""
vowel = bool(char_lower in self.vowels)
syllables.append(syllable)
syllables_ordered = [syllable[::-1] for syllable in syllables][::-1]
return syllables_ordered
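# A minimal, self-contained sketch (runs only when this module is executed
# directly). The tiny word list is illustrative only; in practice a full
# corpus such as nltk.corpus.words is used, as in the class docstring.
if __name__ == "__main__":
    source_words = ["wonder", "full", "sentence", "ten", "der", "won"]
    lp = LegalitySyllableTokenizer(source_words)
    # Onsets observed word-initially in the source text become "legal".
    print(sorted(lp.legal_onsets))
    # Syllabification keeps onsets maximal while staying legal, working
    # backwards from the end of the word.
    print(lp.tokenize("wonderful"))  # expected: ['won', 'der', 'ful']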

View File

@@ -0,0 +1,124 @@
# Multi-Word Expression tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rob Malouf <rmalouf@mail.sdsu.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
Multi-Word Expression Tokenizer
A ``MWETokenizer`` takes a string which has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:
>>> from nltk.tokenize import MWETokenizer
>>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
>>> tokenizer.add_mwe(('in', 'spite', 'of'))
>>> tokenizer.tokenize('Testing testing testing one two three'.split())
['Testing', 'testing', 'testing', 'one', 'two', 'three']
>>> tokenizer.tokenize('This is a test in spite'.split())
['This', 'is', 'a', 'test', 'in', 'spite']
>>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']
"""
from nltk.tokenize.api import TokenizerI
from nltk.util import Trie
class MWETokenizer(TokenizerI):
"""A tokenizer that processes tokenized text and merges multi-word expressions
into single tokens.
"""
def __init__(self, mwes=None, separator="_"):
"""Initialize the multi-word tokenizer with a list of expressions and a
separator
:type mwes: list(list(str))
:param mwes: A sequence of multi-word expressions to be merged, where
each MWE is a sequence of strings.
:type separator: str
:param separator: String that should be inserted between words in a multi-word
expression token. (Default is '_')
"""
if not mwes:
mwes = []
self._mwes = Trie(mwes)
self._separator = separator
def add_mwe(self, mwe):
"""Add a multi-word expression to the lexicon (stored as a word trie)
We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
The key True marks the end of a valid MWE.
:param mwe: The multi-word expression we're adding into the word trie
:type mwe: tuple(str) or list(str)
:Example:
>>> tokenizer = MWETokenizer()
>>> tokenizer.add_mwe(('a', 'b'))
>>> tokenizer.add_mwe(('a', 'b', 'c'))
>>> tokenizer.add_mwe(('a', 'x'))
>>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
>>> tokenizer._mwes == expected
True
"""
self._mwes.insert(mwe)
def tokenize(self, text):
"""
:param text: A list containing tokenized text
:type text: list(str)
:return: A list of the tokenized text with multi-words merged together
:rtype: list(str)
:Example:
>>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
>>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
['An', "hors+d'oeuvre", 'tonight,', 'sir?']
"""
i = 0
n = len(text)
result = []
while i < n:
if text[i] in self._mwes:
# possible MWE match
j = i
trie = self._mwes
last_match = -1
while j < n and text[j] in trie: # and len(trie[text[j]]) > 0 :
trie = trie[text[j]]
j = j + 1
if Trie.LEAF in trie:
last_match = j
else:
if last_match > -1:
j = last_match
if Trie.LEAF in trie or last_match > -1:
# success!
result.append(self._separator.join(text[i:j]))
i = j
else:
# no match, so backtrack
result.append(text[i])
i += 1
else:
result.append(text[i])
i += 1
return result
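# A minimal sketch (runs only when this module is executed directly) of the
# longest-match-with-backtracking behaviour implemented in tokenize().
if __name__ == "__main__":
    tokenizer = MWETokenizer([("in", "spite", "of")])
    tokenizer.add_mwe(("a", "little"))
    tokenizer.add_mwe(("a", "little", "bit"))
    # The longest matching expression wins ("a little bit" over "a little").
    print(tokenizer.tokenize("a little bit in spite of it all".split()))
    # A partial match ("in spite" without "of") backtracks to single tokens.
    print(tokenizer.tokenize("in spite the rain".split()))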

View File

@@ -0,0 +1,179 @@
# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
# Contributors: Ozan Caglayan, Wiktor Stribizew
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""
import io
import re
from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape
class NISTTokenizer(TokenizerI):
"""
This NIST tokenizer is sentence-based, unlike the original
paragraph-based tokenization from mteval-v14.pl; the sentence-based
tokenization is consistent with the other tokenizers available in NLTK.
>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
>>> s = "Good muffins cost $3.88 in New York."
>>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
>>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
>>> nist.tokenize(s, lowercase=False) == expected_cased
True
>>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased.
True
international_tokenize() is the preferred function when tokenizing
non-European text, e.g.
>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
# Input strings.
>>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
>>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
>>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
# Expected tokens.
>>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
>>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
>>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
>>> nist.international_tokenize(albb)[:10] == expected_albb
True
>>> nist.international_tokenize(amz)[:10] == expected_amz
True
>>> nist.international_tokenize(rkt)[:10] == expected_rkt
True
# Doctest for patching issue #1926
>>> sent = u'this is a foo\u2604sentence.'
>>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
>>> nist.international_tokenize(sent) == expected_sent
True
"""
# Strip "skipped" tags
STRIP_SKIP = re.compile("<skipped>"), ""
# Strip end-of-line hyphenation and join lines
STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
# Tokenize punctuation.
PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
# Tokenize period and comma unless preceded by a digit.
PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
# Tokenize period and comma unless followed by a digit.
PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
# Tokenize dash when preceded by a digit
DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
LANG_DEPENDENT_REGEXES = [
PUNCT,
PERIOD_COMMA_PRECEED,
PERIOD_COMMA_FOLLOW,
DASH_PRECEED_DIGIT,
]
# Perluniprops characters used in NIST tokenizer.
pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N}
pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P}
pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S}
# Python regexes need to escape some special symbols,
# see https://stackoverflow.com/q/45670950/610569
number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
# Note: In the original Perl implementation, \p{Z} and \p{Zl} were used to
# (i) strip leading and trailing spaces and
# (ii) de-duplicate spaces.
# In Python, this would do: ' '.join(str.strip().split())
# Thus, the next two lines were commented out.
# Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
# Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
# Pads non-ascii strings with space.
NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
# Tokenize any punctuation unless followed AND preceded by a digit.
PUNCT_1 = (
re.compile(f"([{number_regex}])([{punct_regex}])"),
"\\1 \\2 ",
)
PUNCT_2 = (
re.compile(f"([{punct_regex}])([{number_regex}])"),
" \\1 \\2",
)
# Tokenize symbols
SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "
INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
def lang_independent_sub(self, text):
"""Performs the language independent string substituitions."""
# The order of these regexes is unusual: it would be better to unescape
# after STRIP_EOL_HYPHEN, but we keep it close to the original NIST
# implementation.
regexp, substitution = self.STRIP_SKIP
text = regexp.sub(substitution, text)
text = xml_unescape(text)
regexp, substitution = self.STRIP_EOL_HYPHEN
text = regexp.sub(substitution, text)
return text
def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
text = str(text)
# Language independent regex.
text = self.lang_independent_sub(text)
# Language dependent regex.
if western_lang:
# Pad string with whitespace.
text = " " + text + " "
if lowercase:
text = text.lower()
for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
text = regexp.sub(substitution, text)
# Remove contiguous whitespaces.
text = " ".join(text.split())
# Finally, strips leading and trailing spaces
# and converts the output string into unicode.
text = str(text.strip())
return text if return_str else text.split()
def international_tokenize(
self, text, lowercase=False, split_non_ascii=True, return_str=False
):
text = str(text)
# Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
# first before unescaping.
regexp, substitution = self.STRIP_SKIP
text = regexp.sub(substitution, text)
regexp, substitution = self.STRIP_EOL_HYPHEN
text = regexp.sub(substitution, text)
text = xml_unescape(text)
if lowercase:
text = text.lower()
for regexp, substitution in self.INTERNATIONAL_REGEXES:
text = regexp.sub(substitution, text)
# Make sure that there is only one space between words.
# Strip leading and trailing spaces.
text = " ".join(text.strip().split())
return text if return_str else text.split()
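# A minimal usage sketch (runs only when this module is executed directly).
# It assumes the "perluniprops" corpus required at import time is installed;
# the input strings are illustrative only.
if __name__ == "__main__":
    nist = NISTTokenizer()
    s = "Good muffins cost $3.88 in New York."
    # Western-language tokenization, optionally lowercased or as one string.
    print(nist.tokenize(s))
    print(nist.tokenize(s, lowercase=True, return_str=True))
    # international_tokenize() additionally pads non-ASCII runs and symbols.
    print(nist.international_tokenize("this is a foo\u2604sentence."))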

File diff suppressed because it is too large

View File

@@ -0,0 +1,220 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# Trevor Cohn <tacohn@csse.unimelb.edu.au>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r"""
Regular-Expression Tokenizers
A ``RegexpTokenizer`` splits a string into substrings using a regular expression.
For example, the following tokenizer forms tokens out of alphabetic sequences,
money expressions, and any other non-whitespace sequences:
>>> from nltk.tokenize import RegexpTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
>>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
A ``RegexpTokenizer`` can use its regexp to match delimiters instead:
>>> tokenizer = RegexpTokenizer(r'\s+', gaps=True)
>>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
Note that empty tokens are not returned when the delimiter appears at
the start or end of the string.
The material between the tokens is discarded. For example,
the following tokenizer selects just the capitalized words:
>>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
>>> capword_tokenizer.tokenize(s)
['Good', 'New', 'York', 'Please', 'Thanks']
This module contains several subclasses of ``RegexpTokenizer``
that use pre-defined regular expressions.
>>> from nltk.tokenize import BlanklineTokenizer
>>> # Uses '\s*\n\s*\n\s*':
>>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.',
'Thanks.']
All of the regular expression tokenizers are also available as functions:
>>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
>>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.',
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> blankline_tokenize(s)
['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.']
Caution: The function ``regexp_tokenize()`` takes the text as its
first argument, and the regular expression pattern as its second
argument. This differs from the conventions used by Python's
``re`` functions, where the pattern is always the first argument.
(This is for consistency with the other NLTK tokenizers.)
"""
import re
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import regexp_span_tokenize
class RegexpTokenizer(TokenizerI):
r"""
A tokenizer that splits a string using a regular expression, which
matches either the tokens or the separators between tokens.
>>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
:type pattern: str
:param pattern: The pattern used to build this tokenizer.
(This pattern must not contain capturing parentheses;
Use non-capturing parentheses, e.g. (?:...), instead)
:type gaps: bool
:param gaps: True if this tokenizer's pattern should be used
to find separators between tokens; False if this
tokenizer's pattern should be used to find the tokens
themselves.
:type discard_empty: bool
:param discard_empty: True if any empty tokens `''`
generated by the tokenizer should be discarded. Empty
tokens can only be generated if `_gaps == True`.
:type flags: int
:param flags: The regexp flags used to compile this
tokenizer's pattern. By default, the following flags are
used: `re.UNICODE | re.MULTILINE | re.DOTALL`.
"""
def __init__(
self,
pattern,
gaps=False,
discard_empty=True,
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
# If they gave us a regexp object, extract the pattern.
pattern = getattr(pattern, "pattern", pattern)
self._pattern = pattern
self._gaps = gaps
self._discard_empty = discard_empty
self._flags = flags
self._regexp = None
def _check_regexp(self):
if self._regexp is None:
self._regexp = re.compile(self._pattern, self._flags)
def tokenize(self, text):
self._check_regexp()
# If our regexp matches gaps, use re.split:
if self._gaps:
if self._discard_empty:
return [tok for tok in self._regexp.split(text) if tok]
else:
return self._regexp.split(text)
# If our regexp matches tokens, use re.findall:
else:
return self._regexp.findall(text)
def span_tokenize(self, text):
self._check_regexp()
if self._gaps:
for left, right in regexp_span_tokenize(text, self._regexp):
if not (self._discard_empty and left == right):
yield left, right
else:
for m in re.finditer(self._regexp, text):
yield m.span()
def __repr__(self):
return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format(
self.__class__.__name__,
self._pattern,
self._gaps,
self._discard_empty,
self._flags,
)
class WhitespaceTokenizer(RegexpTokenizer):
r"""
Tokenize a string on whitespace (space, tab, newline).
In general, users should use the string ``split()`` method instead.
>>> from nltk.tokenize import WhitespaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
"""
def __init__(self):
RegexpTokenizer.__init__(self, r"\s+", gaps=True)
class BlanklineTokenizer(RegexpTokenizer):
"""
Tokenize a string, treating any sequence of blank lines as a delimiter.
Blank lines are defined as lines containing no characters, except for
space or tab characters.
"""
def __init__(self):
RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True)
class WordPunctTokenizer(RegexpTokenizer):
r"""
Tokenize a text into a sequence of alphabetic and
non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.
>>> from nltk.tokenize import WordPunctTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York',
'.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
"""
def __init__(self):
RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+")
######################################################################
# { Tokenization Functions
######################################################################
def regexp_tokenize(
text,
pattern,
gaps=False,
discard_empty=True,
flags=re.UNICODE | re.MULTILINE | re.DOTALL,
):
"""
Return a tokenized copy of *text*. See :class:`.RegexpTokenizer`
for descriptions of the arguments.
"""
tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags)
return tokenizer.tokenize(text)
blankline_tokenize = BlanklineTokenizer().tokenize
wordpunct_tokenize = WordPunctTokenizer().tokenize
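# A minimal sketch (runs only when this module is executed directly)
# contrasting token-matching and gap-matching patterns.
if __name__ == "__main__":
    s = "Good muffins cost $3.88\nin New York."
    # The pattern matches the tokens themselves.
    print(RegexpTokenizer(r"\w+|\$[\d\.]+|\S+").tokenize(s))
    # With gaps=True the pattern matches the separators between tokens.
    print(RegexpTokenizer(r"\s+", gaps=True).tokenize(s))
    # Spans index into the original string.
    print(list(WhitespaceTokenizer().span_tokenize(s)))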

View File

@@ -0,0 +1,149 @@
# Natural Language Toolkit: Interface to the Repp Tokenizer
#
# Copyright (C) 2001-2015 NLTK Project
# Authors: Rebecca Dridan and Stephan Oepen
# Contributors: Liling Tan
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import os
import re
import subprocess
import sys
import tempfile
from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI
class ReppTokenizer(TokenizerI):
"""
A class for word tokenization using the REPP parser described in
Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406
>>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
... ]
>>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
>>> for sent in sents: # doctest: +SKIP
... tokenizer.tokenize(sent) # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
... print(sent) # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
... print(sent) # doctest: +SKIP
...
[(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
[(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
[(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
"""
def __init__(self, repp_dir, encoding="utf8"):
self.repp_dir = self.find_repptokenizer(repp_dir)
# Set a directory to store the temporary files.
self.working_dir = tempfile.gettempdir()
# Set an encoding for the input strings.
self.encoding = encoding
def tokenize(self, sentence):
"""
Use Repp to tokenize a single sentence.
:param sentence: A single sentence string.
:type sentence: str
:return: A tuple of tokens.
:rtype: tuple(str)
"""
return next(self.tokenize_sents([sentence]))
def tokenize_sents(self, sentences, keep_token_positions=False):
"""
Tokenize multiple sentences using Repp.
:param sentences: A list of sentence strings.
:type sentences: list(str)
:return: A list of tuples of tokens
:rtype: iter(tuple(str))
"""
with tempfile.NamedTemporaryFile(
prefix="repp_input.", dir=self.working_dir, mode="w", delete=False
) as input_file:
# Write sentences to temporary input file.
for sent in sentences:
input_file.write(str(sent) + "\n")
input_file.close()
# Generate command to run REPP.
cmd = self.generate_repp_command(input_file.name)
# Decode the stdout and strip the trailing newline.
repp_output = self._execute(cmd).decode(self.encoding).strip()
for tokenized_sent in self.parse_repp_outputs(repp_output):
if not keep_token_positions:
# Removes token position information.
tokenized_sent, starts, ends = zip(*tokenized_sent)
yield tokenized_sent
def generate_repp_command(self, inputfilename):
"""
Generates the REPP command to be run at the terminal.
:param inputfilename: path to the input file
:type inputfilename: str
"""
cmd = [self.repp_dir + "/src/repp"]
cmd += ["-c", self.repp_dir + "/erg/repp.set"]
cmd += ["--format", "triple"]
cmd += [inputfilename]
return cmd
@staticmethod
def _execute(cmd):
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
return stdout
@staticmethod
def parse_repp_outputs(repp_output):
"""
Parses the tri-tuple format that REPP outputs using the
"--format triple" option and returns a generator of tuples of string
tokens.
:param repp_output: The raw output string produced by the REPP tokenizer.
:type repp_output: str
:return: an iterable of the tokenized sentences as tuples of strings
:rtype: iter(tuple)
"""
line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE)
for section in repp_output.split("\n\n"):
words_with_positions = [
(token, int(start), int(end))
for start, end, token in line_regex.findall(section)
]
yield words_with_positions
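# Illustrative note (an assumption based on the regex above, not part of the
# original source): with "--format triple", each token is expected on its own
# "(start, end, token)" line and sentences are separated by a blank line, e.g.
#
#     (0, 4, Good)
#     (5, 12, muffins)
#
# which this parser yields as [('Good', 0, 4), ('muffins', 5, 12)].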
def find_repptokenizer(self, repp_dirname):
"""
Find the REPP tokenizer binary and its *repp.set* config file.
"""
if os.path.exists(repp_dirname): # If a full path is given.
_repp_dir = repp_dirname
else: # Try to find path to REPP directory in environment variables.
_repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",))
# Checks for the REPP binary and erg/repp.set config file.
assert os.path.exists(_repp_dir + "/src/repp")
assert os.path.exists(_repp_dir + "/erg/repp.set")
return _repp_dir
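# Illustrative usage sketch (assumptions: a locally compiled REPP checkout at
# /path/to/repp containing src/repp and erg/repp.set; not part of the original
# source):
#
#     tokenizer = ReppTokenizer('/path/to/repp')
#     for tokens in tokenizer.tokenize_sents(['Tokenization is fun.']):
#         print(tokens)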

View File

@@ -0,0 +1,140 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Yoav Goldberg <yoavg@cs.bgu.ac.il>
# Steven Bird <stevenbird1@gmail.com> (minor edits)
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
S-Expression Tokenizer
``SExprTokenizer`` is used to find parenthesized expressions in a
string. In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.
>>> from nltk.tokenize import SExprTokenizer
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:
>>> SExprTokenizer().tokenize('c) d) e (f (g')
Traceback (most recent call last):
...
ValueError: Un-matched close paren at char 1
The ``strict`` argument can be set to False to allow for
non-matching parentheses. Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
['c', ')', 'd', ')', 'e', '(f (g']
The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:
>>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
['{a b {c d}}', 'e', 'f', '{g}']
The s-expression tokenizer is also available as a function:
>>> from nltk.tokenize import sexpr_tokenize
>>> sexpr_tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
"""
import re
from nltk.tokenize.api import TokenizerI
class SExprTokenizer(TokenizerI):
"""
A tokenizer that divides strings into s-expressions.
An s-expression can be either:
- a parenthesized expression, including any nested parenthesized
expressions, or
- a sequence of non-whitespace non-parenthesis characters.
For example, the string ``(a (b c)) d e (f)`` consists of four
s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.
By default, the characters ``(`` and ``)`` are treated as open and
close parentheses, but alternative strings may be specified.
:param parens: A two-element sequence specifying the open and close parentheses
that should be used to find sexprs. This will typically be either a
two-character string, or a list of two strings.
:type parens: str or list
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
"""
def __init__(self, parens="()", strict=True):
if len(parens) != 2:
raise ValueError("parens must contain exactly two strings")
self._strict = strict
self._open_paren = parens[0]
self._close_paren = parens[1]
self._paren_regexp = re.compile(
f"{re.escape(parens[0])}|{re.escape(parens[1])}"
)
def tokenize(self, text):
"""
Return a list of s-expressions extracted from *text*.
For example:
>>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
['(a b (c d))', 'e', 'f', '(g)']
All parentheses are assumed to mark s-expressions.
(No special processing is done to exclude parentheses that occur
inside strings, or following backslash characters.)
If the given expression contains non-matching parentheses,
then the behavior of the tokenizer depends on the ``strict``
parameter to the constructor. If ``strict`` is ``True``, then
raise a ``ValueError``. If ``strict`` is ``False``, then any
unmatched close parentheses will be listed as their own
s-expression; and the last partial s-expression with unmatched open
parentheses will be listed as its own s-expression:
>>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
['c', ')', 'd', ')', 'e', '(f (g']
:param text: the string to be tokenized
:type text: str or iter(str)
:rtype: iter(str)
"""
result = []
pos = 0
depth = 0
for m in self._paren_regexp.finditer(text):
paren = m.group()
if depth == 0:
result += text[pos : m.start()].split()
pos = m.start()
if paren == self._open_paren:
depth += 1
if paren == self._close_paren:
if self._strict and depth == 0:
raise ValueError("Un-matched close paren at char %d" % m.start())
depth = max(0, depth - 1)
if depth == 0:
result.append(text[pos : m.end()])
pos = m.end()
if self._strict and depth > 0:
raise ValueError("Un-matched open paren at char %d" % pos)
if pos < len(text):
result.append(text[pos:])
return result
sexpr_tokenize = SExprTokenizer().tokenize
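# Illustrative note (not part of the original source): because each delimiter
# is regex-escaped on its own, multi-character delimiters also work, e.g.
#
#     >>> SExprTokenizer(parens=["<<", ">>"]).tokenize("<<a <<b c>> d>> e <<f>>")
#     ['<<a <<b c>> d>>', 'e', '<<f>>']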

View File

@@ -0,0 +1,139 @@
# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r"""
Simple Tokenizers
These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.
The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> s.split() # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
>>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
>>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', '', 'Thanks.']
The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer. For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
"""
from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize
class SpaceTokenizer(StringTokenizer):
r"""Tokenize a string using the space character as a delimiter,
which is the same as ``s.split(' ')``.
>>> from nltk.tokenize import SpaceTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
"""
_string = " "
class TabTokenizer(StringTokenizer):
r"""Tokenize a string use the tab character as a delimiter,
the same as ``s.split('\t')``.
>>> from nltk.tokenize import TabTokenizer
>>> TabTokenizer().tokenize('a\tb c\n\t d')
['a', 'b c\n', ' d']
"""
_string = "\t"
class CharTokenizer(StringTokenizer):
"""Tokenize a string into individual characters. If this functionality
is ever required directly, use ``for char in string``.
"""
_string = None
def tokenize(self, s):
return list(s)
def span_tokenize(self, s):
yield from enumerate(range(1, len(s) + 1))
class LineTokenizer(TokenizerI):
r"""Tokenize a string into its lines, optionally discarding blank lines.
This is similar to ``s.split('\n')``.
>>> from nltk.tokenize import LineTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
>>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', '', 'Thanks.']
>>> # same as [l for l in s.split('\n') if l.strip()]:
>>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good muffins cost $3.88', 'in New York. Please buy me',
'two of them.', 'Thanks.']
:param blanklines: Indicates how blank lines should be handled. Valid values are:
- ``discard``: strip blank lines out of the token list before returning it.
A line is considered blank if it contains only whitespace characters.
- ``keep``: leave all blank lines in the token list.
- ``discard-eof``: if the string ends with a newline, then do not generate
a corresponding token ``''`` after that newline.
"""
def __init__(self, blanklines="discard"):
valid_blanklines = ("discard", "keep", "discard-eof")
if blanklines not in valid_blanklines:
raise ValueError(
"Blank lines must be one of: %s" % " ".join(valid_blanklines)
)
self._blanklines = blanklines
def tokenize(self, s):
lines = s.splitlines()
# If requested, strip off blank lines.
if self._blanklines == "discard":
lines = [l for l in lines if l.rstrip()]
elif self._blanklines == "discard-eof":
if lines and not lines[-1].strip():
lines.pop()
return lines
# Note: the 'discard-eof' option is not implemented in span_tokenize below.
def span_tokenize(self, s):
if self._blanklines == "keep":
yield from string_span_tokenize(s, r"\n")
else:
yield from regexp_span_tokenize(s, r"\n(\s+\n)*")
######################################################################
# { Tokenization Functions
######################################################################
# XXX: the module docs state that there are no function versions
def line_tokenize(text, blanklines="discard"):
return LineTokenizer(blanklines).tokenize(text)
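# Illustrative note (not part of the original source): the functional form
# simply wraps LineTokenizer, e.g.
#
#     >>> line_tokenize("Good muffins cost $3.88\n\nThanks.")
#     ['Good muffins cost $3.88', 'Thanks.']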

View File

@@ -0,0 +1,194 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Christopher Hench <chris.l.hench@gmail.com>
# Alex Estes
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
The Sonority Sequencing Principle (SSP) is a language-agnostic algorithm proposed
by Otto Jespersen in 1904. The sonorous quality of a phoneme is judged by the
openness of the lips. Syllable breaks occur before troughs in sonority. For more
on the SSP see Selkirk (1984).
The default implementation uses the English alphabet, but the `sonority_hierarchy`
can be modified to IPA or any other alphabet for the use-case. The SSP is a
universal syllabification algorithm, but that does not mean it performs equally
across languages. Bartlett et al. (2009) is a good benchmark for English accuracy
if utilizing IPA (pg. 311).
Importantly, if a custom hierarchy is supplied and vowels span across more than
one level, they should be given separately to the `vowels` class attribute.
References:
- Otto Jespersen. 1904. Lehrbuch der Phonetik.
Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203.
- Elisabeth Selkirk. 1984. On the major class features and syllable theory.
In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology.
Cambridge, MIT Press. pp. 107-136.
- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes.
In HLT-NAACL. pp. 308-316.
"""
import re
import warnings
from string import punctuation
from nltk.tokenize.api import TokenizerI
from nltk.util import ngrams
class SyllableTokenizer(TokenizerI):
"""
Syllabifies words based on the Sonority Sequencing Principle (SSP).
>>> from nltk.tokenize import SyllableTokenizer
>>> from nltk import word_tokenize
>>> SSP = SyllableTokenizer()
>>> SSP.tokenize('justification')
['jus', 'ti', 'fi', 'ca', 'tion']
>>> text = "This is a foobar-like sentence."
>>> [SSP.tokenize(token) for token in word_tokenize(text)]
[['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
"""
def __init__(self, lang="en", sonority_hierarchy=False):
"""
:param lang: Language parameter, default is English, 'en'
:type lang: str
:param sonority_hierarchy: Sonority hierarchy according to the
Sonority Sequencing Principle.
:type sonority_hierarchy: list(str)
"""
# Sonority hierarchy should be provided in descending order.
# If vowels are spread across multiple levels, they should be
# assigned to the self.vowels attribute together; otherwise they
# should be placed at the first index of the hierarchy.
if not sonority_hierarchy and lang == "en":
sonority_hierarchy = [
"aeiouy", # vowels.
"lmnrw", # nasals.
"zvsf", # fricatives.
"bcdgtkpqxhj", # stops.
]
self.vowels = sonority_hierarchy[0]
self.phoneme_map = {}
for i, level in enumerate(sonority_hierarchy):
for c in level:
sonority_level = len(sonority_hierarchy) - i
self.phoneme_map[c] = sonority_level
self.phoneme_map[c.upper()] = sonority_level
def assign_values(self, token):
"""
Assigns each phoneme its value from the sonority hierarchy.
Note: Sentence/text has to be tokenized first.
:param token: Single word or token
:type token: str
:return: List of tuples, first element is character/phoneme and
second is the sonority value.
:rtype: list(tuple(str, int))
"""
syllables_values = []
for c in token:
try:
syllables_values.append((c, self.phoneme_map[c]))
except KeyError:
if c not in "0123456789" and c not in punctuation:
warnings.warn(
"Character not defined in sonority_hierarchy,"
" assigning as vowel: '{}'".format(c)
)
syllables_values.append((c, max(self.phoneme_map.values())))
if c not in self.vowels:
self.vowels += c
else: # If it's a punctuation or numbers, assign -1.
syllables_values.append((c, -1))
return syllables_values
def validate_syllables(self, syllable_list):
"""
Ensures each syllable has at least one vowel.
If the following syllable doesn't have a vowel, add it to the current one.
:param syllable_list: Single word or token broken up into syllables.
:type syllable_list: list(str)
:return: Single word or token broken up into syllables
(with added syllables if necessary)
:rtype: list(str)
"""
valid_syllables = []
front = ""
vowel_pattern = re.compile("|".join(self.vowels))
for i, syllable in enumerate(syllable_list):
if syllable in punctuation:
valid_syllables.append(syllable)
continue
if not vowel_pattern.search(syllable):
if len(valid_syllables) == 0:
front += syllable
else:
valid_syllables = valid_syllables[:-1] + [
valid_syllables[-1] + syllable
]
else:
if len(valid_syllables) == 0:
valid_syllables.append(front + syllable)
else:
valid_syllables.append(syllable)
return valid_syllables
def tokenize(self, token):
"""
Apply the SSP to return a list of syllables.
Note: Sentence/text has to be tokenized first.
:param token: Single word or token
:type token: str
:return syllable_list: Single word or token broken up into syllables.
:rtype: list(str)
"""
# assign values from hierarchy
syllables_values = self.assign_values(token)
# if only one vowel return word
if sum(token.count(x) for x in self.vowels) <= 1:
return [token]
syllable_list = []
syllable = syllables_values[0][0] # start syllable with first phoneme
for trigram in ngrams(syllables_values, n=3):
phonemes, values = zip(*trigram)
# Sonority of previous, focal and following phoneme
prev_value, focal_value, next_value = values
# Focal phoneme.
focal_phoneme = phonemes[1]
# These cases trigger syllable break.
if focal_value == -1: # If it's a punctuation, just break.
syllable_list.append(syllable)
syllable_list.append(focal_phoneme)
syllable = ""
elif prev_value >= focal_value == next_value:
syllable += focal_phoneme
syllable_list.append(syllable)
syllable = ""
elif prev_value > focal_value < next_value:
syllable_list.append(syllable)
syllable = ""
syllable += focal_phoneme
# no syllable break
else:
syllable += focal_phoneme
syllable += syllables_values[-1][0] # append last phoneme
syllable_list.append(syllable)
return self.validate_syllables(syllable_list)
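# Illustrative worked example (not part of the original source): for "basket"
# the default hierarchy assigns b=1, a=4, s=2, k=1, e=4, t=1; the only sonority
# trough flanked by higher values on both sides is at "k" (2 > 1 < 4), so a
# single break is placed there:
#
#     >>> SyllableTokenizer().tokenize("basket")
#     ['bas', 'ket']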

View File

@@ -0,0 +1,115 @@
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Xu <xxu@student.unimelb.edu.au>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import json
import os
import tempfile
import warnings
from subprocess import PIPE
from nltk.internals import _java_options, config_java, find_jar, java
from nltk.parse.corenlp import CoreNLPParser
from nltk.tokenize.api import TokenizerI
_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml"
class StanfordTokenizer(TokenizerI):
r"""
Interface to the Stanford Tokenizer
>>> from nltk.tokenize.stanford import StanfordTokenizer
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
>>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> s = "The colour of the wall is blue."
>>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
"""
_JAR = "stanford-postagger.jar"
def __init__(
self,
path_to_jar=None,
encoding="utf8",
options=None,
verbose=False,
java_options="-mx1000m",
):
# Raise deprecation warning.
warnings.warn(
str(
"\nThe StanfordTokenizer will "
"be deprecated in version 3.2.5.\n"
"Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'"
),
DeprecationWarning,
stacklevel=2,
)
self._stanford_jar = find_jar(
self._JAR,
path_to_jar,
env_vars=("STANFORD_POSTAGGER",),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items())
@staticmethod
def _parse_tokenized_output(s):
return s.splitlines()
def tokenize(self, s):
"""
Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
"""
cmd = ["edu.stanford.nlp.process.PTBTokenizer"]
return self._parse_tokenized_output(self._execute(cmd, s))
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
cmd.extend(["-charset", encoding])
_options_cmd = self._options_cmd
if _options_cmd:
cmd.extend(["-options", self._options_cmd])
default_options = " ".join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file:
# Write the actual sentences to the temporary input file
if isinstance(input_, str) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
cmd.append(input_file.name)
# Run the tagger and get the output.
stdout, stderr = java(
cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
)
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout
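# Illustrative sketch of the recommended replacement (assumption: a CoreNLP
# server is already running at http://localhost:9000; not part of the original
# source):
#
#     from nltk.parse.corenlp import CoreNLPParser
#     parser = CoreNLPParser(url='http://localhost:9000')
#     tokens = list(parser.tokenize('Good muffins cost $3.88 in New York.'))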

View File

@@ -0,0 +1,292 @@
#!/usr/bin/env python
# Natural Language Toolkit: Interface to the Stanford Segmenter
# for Chinese and Arabic
#
# Copyright (C) 2001-2025 NLTK Project
# Author: 52nlp <52nlpcn@gmail.com>
# Casper Lehmann-Strøm <casperlehmann@gmail.com>
# Alex Constantin <alex@keyworder.ch>
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import json
import os
import tempfile
import warnings
from subprocess import PIPE
from nltk.internals import (
_java_options,
config_java,
find_dir,
find_file,
find_jar,
java,
)
from nltk.tokenize.api import TokenizerI
_stanford_url = "https://nlp.stanford.edu/software"
class StanfordSegmenter(TokenizerI):
"""Interface to the Stanford Segmenter
If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
should be provided, for example::
seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> seg = StanfordSegmenter() # doctest: +SKIP
>>> seg.default_config('zh') # doctest: +SKIP
>>> sent = u'这是斯坦福中文分词器测试'
>>> print(seg.segment(sent)) # doctest: +SKIP
\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5
<BLANKLINE>
>>> seg.default_config('ar') # doctest: +SKIP
>>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
>>> print(seg.segment(sent.split())) # doctest: +SKIP
\u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a
<BLANKLINE>
"""
_JAR = "stanford-segmenter.jar"
def __init__(
self,
path_to_jar=None,
path_to_slf4j=None,
java_class=None,
path_to_model=None,
path_to_dict=None,
path_to_sihan_corpora_dict=None,
sihan_post_processing="false",
keep_whitespaces="false",
encoding="UTF-8",
options=None,
verbose=False,
java_options="-mx2g",
):
# Raise deprecation warning.
warnings.simplefilter("always", DeprecationWarning)
warnings.warn(
str(
"\nThe StanfordTokenizer will "
"be deprecated in version 3.2.5.\n"
"Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'"
),
DeprecationWarning,
stacklevel=2,
)
warnings.simplefilter("ignore", DeprecationWarning)
stanford_segmenter = find_jar(
self._JAR,
path_to_jar,
env_vars=("STANFORD_SEGMENTER",),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
if path_to_slf4j is not None:
slf4j = find_jar(
"slf4j-api.jar",
path_to_slf4j,
env_vars=("SLF4J", "STANFORD_SEGMENTER"),
searchpath=(),
url=_stanford_url,
verbose=verbose,
)
else:
slf4j = None
# This is passed to java as the -cp option, the old version of segmenter needs slf4j.
# The new version of stanford-segmenter-2016-10-31 doesn't need slf4j
self._stanford_jar = os.pathsep.join(
_ for _ in [stanford_segmenter, slf4j] if _ is not None
)
self._java_class = java_class
self._model = path_to_model
self._sihan_corpora_dict = path_to_sihan_corpora_dict
self._sihan_post_processing = sihan_post_processing
self._keep_whitespaces = keep_whitespaces
self._dict = path_to_dict
self._encoding = encoding
self.java_options = java_options
options = {} if options is None else options
self._options_cmd = ",".join(
f"{key}={json.dumps(val)}" for key, val in options.items()
)
def default_config(self, lang):
"""
Attempt to initialize Stanford Word Segmenter for the specified language
using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
"""
search_path = ()
if os.environ.get("STANFORD_SEGMENTER"):
search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")}
# init for Chinese-specific files
self._dict = None
self._sihan_corpora_dict = None
self._sihan_post_processing = "false"
if lang == "ar":
self._java_class = (
"edu.stanford.nlp.international.arabic.process.ArabicSegmenter"
)
model = "arabic-segmenter-atb+bn+arztrain.ser.gz"
elif lang == "zh":
self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier"
model = "pku.gz"
self._sihan_post_processing = "true"
path_to_dict = "dict-chris6.ser.gz"
try:
self._dict = find_file(
path_to_dict,
searchpath=search_path,
url=_stanford_url,
verbose=False,
env_vars=("STANFORD_MODELS",),
)
except LookupError as e:
raise LookupError(
"Could not find '%s' (tried using env. "
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)"
% path_to_dict
) from e
sihan_dir = "./data/"
try:
path_to_sihan_dir = find_dir(
sihan_dir,
url=_stanford_url,
verbose=False,
env_vars=("STANFORD_SEGMENTER",),
)
self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir)
except LookupError as e:
raise LookupError(
"Could not find '%s' (tried using the "
"STANFORD_SEGMENTER environment variable)" % sihan_dir
) from e
else:
raise LookupError(f"Unsupported language {lang}")
try:
self._model = find_file(
model,
searchpath=search_path,
url=_stanford_url,
verbose=False,
env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"),
)
except LookupError as e:
raise LookupError(
"Could not find '%s' (tried using env. "
"variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)" % model
) from e
def tokenize(self, s):
super().tokenize(s)
def segment_file(self, input_file_path):
""" """
cmd = [
self._java_class,
"-loadClassifier",
self._model,
"-keepAllWhitespaces",
self._keep_whitespaces,
"-textFile",
input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
"-serDictionary",
self._dict,
"-sighanCorporaDict",
self._sihan_corpora_dict,
"-sighanPostProcessing",
self._sihan_post_processing,
]
)
stdout = self._execute(cmd)
return stdout
def segment(self, tokens):
return self.segment_sents([tokens])
def segment_sents(self, sentences):
""" """
encoding = self._encoding
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
# Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, "wb")
_input = "\n".join(" ".join(x) for x in sentences)
if isinstance(_input, str) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
self._java_class,
"-loadClassifier",
self._model,
"-keepAllWhitespaces",
self._keep_whitespaces,
"-textFile",
self._input_file_path,
]
if self._sihan_corpora_dict is not None:
cmd.extend(
[
"-serDictionary",
self._dict,
"-sighanCorporaDict",
self._sihan_corpora_dict,
"-sighanPostProcessing",
self._sihan_post_processing,
]
)
stdout = self._execute(cmd)
# Delete the temporary file
os.unlink(self._input_file_path)
return stdout
def _execute(self, cmd, verbose=False):
encoding = self._encoding
cmd.extend(["-inputEncoding", encoding])
_options_cmd = self._options_cmd
if _options_cmd:
cmd.extend(["-options", self._options_cmd])
default_options = " ".join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
stdout, _stderr = java(
cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
)
stdout = stdout.decode(encoding)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout

View File

@@ -0,0 +1,474 @@
# Natural Language Toolkit: TextTiling
#
# Copyright (C) 2001-2025 NLTK Project
# Author: George Boutsioukis
#
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import math
import re
try:
import numpy
except ImportError:
pass
from nltk.tokenize.api import TokenizerI
BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1
LC, HC = 0, 1
DEFAULT_SMOOTHING = [0]
class TextTilingTokenizer(TokenizerI):
"""Tokenize a document into topical sections using the TextTiling algorithm.
This algorithm detects subtopic shifts based on the analysis of lexical
co-occurrence patterns.
The process starts by tokenizing the text into pseudosentences of
a fixed size w. Then, depending on the method used, similarity
scores are assigned at sentence gaps. The algorithm proceeds by
detecting the peak differences between these scores and marking
them as boundaries. The boundaries are normalized to the closest
paragraph break and the segmented text is returned.
:param w: Pseudosentence size
:type w: int
:param k: Size (in sentences) of the block used in the block comparison method
:type k: int
:param similarity_method: The method used for determining similarity scores:
`BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`.
:type similarity_method: constant
:param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus)
:type stopwords: list(str)
:param smoothing_method: The method used for smoothing the score plot:
`DEFAULT_SMOOTHING` (default)
:type smoothing_method: constant
:param smoothing_width: The width of the window used by the smoothing method
:type smoothing_width: int
:param smoothing_rounds: The number of smoothing passes
:type smoothing_rounds: int
:param cutoff_policy: The policy used to determine the number of boundaries:
`HC` (default) or `LC`
:type cutoff_policy: constant
>>> from nltk.corpus import brown
>>> tt = TextTilingTokenizer(demo_mode=True)
>>> text = brown.raw()[:4000]
>>> s, ss, d, b = tt.tokenize(text)
>>> b
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]
"""
def __init__(
self,
w=20,
k=10,
similarity_method=BLOCK_COMPARISON,
stopwords=None,
smoothing_method=DEFAULT_SMOOTHING,
smoothing_width=2,
smoothing_rounds=1,
cutoff_policy=HC,
demo_mode=False,
):
if stopwords is None:
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
self.__dict__.update(locals())
del self.__dict__["self"]
def tokenize(self, text):
"""Return a tokenized copy of *text*, where each "token" represents
a separate topic."""
lowercase_text = text.lower()
paragraph_breaks = self._mark_paragraph_breaks(text)
text_length = len(lowercase_text)
# Tokenization step starts here
# Remove punctuation
nopunct_text = "".join(
c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c)
)
nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text)
tokseqs = self._divide_to_tokensequences(nopunct_text)
# The morphological stemming step mentioned in the TextTile
# paper is not implemented. A comment in the original C
# implementation states that it offers no benefit to the
# process. It might be interesting to test the existing
# stemmers though.
# words = _stem_words(words)
# Filter stopwords
for ts in tokseqs:
ts.wrdindex_list = [
wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords
]
token_table = self._create_token_table(tokseqs, nopunct_par_breaks)
# End of the Tokenization step
# Lexical score determination
if self.similarity_method == BLOCK_COMPARISON:
gap_scores = self._block_comparison(tokseqs, token_table)
elif self.similarity_method == VOCABULARY_INTRODUCTION:
raise NotImplementedError("Vocabulary introduction not implemented")
else:
raise ValueError(
f"Similarity method {self.similarity_method} not recognized"
)
if self.smoothing_method == DEFAULT_SMOOTHING:
smooth_scores = self._smooth_scores(gap_scores)
else:
raise ValueError(f"Smoothing method {self.smoothing_method} not recognized")
# End of Lexical score Determination
# Boundary identification
depth_scores = self._depth_scores(smooth_scores)
segment_boundaries = self._identify_boundaries(depth_scores)
normalized_boundaries = self._normalize_boundaries(
text, segment_boundaries, paragraph_breaks
)
# End of Boundary Identification
segmented_text = []
prevb = 0
for b in normalized_boundaries:
if b == 0:
continue
segmented_text.append(text[prevb:b])
prevb = b
if prevb < text_length: # append any text that may be remaining
segmented_text.append(text[prevb:])
if not segmented_text:
segmented_text = [text]
if self.demo_mode:
return gap_scores, smooth_scores, depth_scores, segment_boundaries
return segmented_text
def _block_comparison(self, tokseqs, token_table):
"""Implements the block comparison method"""
def blk_frq(tok, block):
ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences)
freq = sum(tsocc[1] for tsocc in ts_occs)
return freq
gap_scores = []
numgaps = len(tokseqs) - 1
for curr_gap in range(numgaps):
score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0
score = 0.0
# adjust window size for boundary conditions
if curr_gap < self.k - 1:
window_size = curr_gap + 1
elif curr_gap > numgaps - self.k:
window_size = numgaps - curr_gap
else:
window_size = self.k
b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]]
b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]]
for t in token_table:
score_dividend += blk_frq(t, b1) * blk_frq(t, b2)
score_divisor_b1 += blk_frq(t, b1) ** 2
score_divisor_b2 += blk_frq(t, b2) ** 2
try:
score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2)
except ZeroDivisionError:
pass # score += 0.0
gap_scores.append(score)
return gap_scores
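# Illustrative worked example (not part of the original source): the gap score
# computed above is the cosine similarity of the two blocks' term-frequency
# vectors. For b1 = {"cat": 2, "dog": 1} and b2 = {"cat": 1, "dog": 2}:
# (2*1 + 1*2) / sqrt((2**2 + 1**2) * (1**2 + 2**2)) = 4 / 5 = 0.8.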
def _smooth_scores(self, gap_scores):
"Wraps the smooth function from the SciPy Cookbook"
return list(
smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1)
)
def _mark_paragraph_breaks(self, text):
"""Identifies indented text or line breaks as the beginning of
paragraphs"""
MIN_PARAGRAPH = 100
pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*")
matches = pattern.finditer(text)
last_break = 0
pbreaks = [0]
for pb in matches:
if pb.start() - last_break < MIN_PARAGRAPH:
continue
else:
pbreaks.append(pb.start())
last_break = pb.start()
return pbreaks
def _divide_to_tokensequences(self, text):
"Divides the text into pseudosentences of fixed size"
w = self.w
wrdindex_list = []
matches = re.finditer(r"\w+", text)
for match in matches:
wrdindex_list.append((match.group(), match.start()))
return [
TokenSequence(i / w, wrdindex_list[i : i + w])
for i in range(0, len(wrdindex_list), w)
]
def _create_token_table(self, token_sequences, par_breaks):
"Creates a table of TokenTableFields"
token_table = {}
current_par = 0
current_tok_seq = 0
pb_iter = par_breaks.__iter__()
current_par_break = next(pb_iter)
if current_par_break == 0:
try:
current_par_break = next(pb_iter) # skip break at 0
except StopIteration as e:
raise ValueError(
"No paragraph breaks were found(text too short perhaps?)"
) from e
for ts in token_sequences:
for word, index in ts.wrdindex_list:
try:
while index > current_par_break:
current_par_break = next(pb_iter)
current_par += 1
except StopIteration:
# hit bottom
pass
if word in token_table:
token_table[word].total_count += 1
if token_table[word].last_par != current_par:
token_table[word].last_par = current_par
token_table[word].par_count += 1
if token_table[word].last_tok_seq != current_tok_seq:
token_table[word].last_tok_seq = current_tok_seq
token_table[word].ts_occurences.append([current_tok_seq, 1])
else:
token_table[word].ts_occurences[-1][1] += 1
else: # new word
token_table[word] = TokenTableField(
first_pos=index,
ts_occurences=[[current_tok_seq, 1]],
total_count=1,
par_count=1,
last_par=current_par,
last_tok_seq=current_tok_seq,
)
current_tok_seq += 1
return token_table
def _identify_boundaries(self, depth_scores):
"""Identifies boundaries at the peaks of similarity score
differences"""
boundaries = [0 for x in depth_scores]
avg = sum(depth_scores) / len(depth_scores)
stdev = numpy.std(depth_scores)
if self.cutoff_policy == LC:
cutoff = avg - stdev
else:
cutoff = avg - stdev / 2.0
depth_tuples = sorted(zip(depth_scores, range(len(depth_scores))))
depth_tuples.reverse()
hp = list(filter(lambda x: x[0] > cutoff, depth_tuples))
for dt in hp:
boundaries[dt[1]] = 1
for dt2 in hp: # undo if there is a boundary close already
if (
dt[1] != dt2[1]
and abs(dt2[1] - dt[1]) < 4
and boundaries[dt2[1]] == 1
):
boundaries[dt[1]] = 0
return boundaries
def _depth_scores(self, scores):
"""Calculates the depth of each gap, i.e. the average difference
between the left and right peaks and the gap's score"""
depth_scores = [0 for x in scores]
# Clip boundaries: this relies on the rule of thumb (my thumb)
# that a section shouldn't be smaller than at least 2
# pseudosentences for small texts and around 5 for larger ones.
clip = min(max(len(scores) // 10, 2), 5)
index = clip
for gapscore in scores[clip:-clip]:
lpeak = gapscore
for score in scores[index::-1]:
if score >= lpeak:
lpeak = score
else:
break
rpeak = gapscore
for score in scores[index:]:
if score >= rpeak:
rpeak = score
else:
break
depth_scores[index] = lpeak + rpeak - 2 * gapscore
index += 1
return depth_scores
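# Illustrative worked example (not part of the original source): for a gap
# score of 0.2 with a left peak of 0.5 and a right peak of 0.4, the depth is
# (0.5 - 0.2) + (0.4 - 0.2) = 0.5 + 0.4 - 2 * 0.2 = 0.5.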
def _normalize_boundaries(self, text, boundaries, paragraph_breaks):
"""Normalize the boundaries identified to the original text's
paragraph breaks"""
norm_boundaries = []
char_count, word_count, gaps_seen = 0, 0, 0
seen_word = False
for char in text:
char_count += 1
if char in " \t\n" and seen_word:
seen_word = False
word_count += 1
if char not in " \t\n" and not seen_word:
seen_word = True
if gaps_seen < len(boundaries) and word_count > (
max(gaps_seen * self.w, self.w)
):
if boundaries[gaps_seen] == 1:
# find closest paragraph break
best_fit = len(text)
for br in paragraph_breaks:
if best_fit > abs(br - char_count):
best_fit = abs(br - char_count)
bestbr = br
else:
break
if bestbr not in norm_boundaries: # avoid duplicates
norm_boundaries.append(bestbr)
gaps_seen += 1
return norm_boundaries
class TokenTableField:
"""A field in the token table holding parameters for each token,
used later in the process"""
def __init__(
self,
first_pos,
ts_occurences,
total_count=1,
par_count=1,
last_par=0,
last_tok_seq=None,
):
self.__dict__.update(locals())
del self.__dict__["self"]
class TokenSequence:
"A token list with its original length and its index"
def __init__(self, index, wrdindex_list, original_length=None):
original_length = original_length or len(wrdindex_list)
self.__dict__.update(locals())
del self.__dict__["self"]
# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth
def smooth(x, window_len=11, window="flat"):
"""smooth the data using a window with requested size.
This method is based on the convolution of a scaled window with the signal.
The signal is prepared by introducing reflected copies of the signal
(with the window size) in both ends so that transient parts are minimized
in the beginning and end part of the output signal.
:param x: the input signal
:param window_len: the dimension of the smoothing window; should be an odd integer
:param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
flat window will produce a moving average smoothing.
:return: the smoothed signal
example::
t = numpy.linspace(-2, 2, 50)
x = numpy.sin(t) + numpy.random.randn(len(t)) * 0.1
y = smooth(x)
:see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve,
scipy.signal.lfilter
TODO: the window parameter could be the window itself if an array instead of a string
"""
if x.ndim != 1:
raise ValueError("smooth only accepts 1 dimension arrays.")
if x.size < window_len:
raise ValueError("Input vector needs to be bigger than window size.")
if window_len < 3:
return x
if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]:
raise ValueError(
"Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'"
)
s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]]
# print(len(s))
if window == "flat": # moving average
w = numpy.ones(window_len, "d")
else:
w = eval("numpy." + window + "(window_len)")
y = numpy.convolve(w / w.sum(), s, mode="same")
return y[window_len - 1 : -window_len + 1]
def demo(text=None):
from matplotlib import pylab
from nltk.corpus import brown
tt = TextTilingTokenizer(demo_mode=True)
if text is None:
text = brown.raw()[:10000]
s, ss, d, b = tt.tokenize(text)
pylab.xlabel("Sentence Gap index")
pylab.ylabel("Gap Scores")
pylab.plot(range(len(s)), s, label="Gap Scores")
pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores")
pylab.plot(range(len(d)), d, label="Depth scores")
pylab.stem(range(len(b)), b)
pylab.legend()
pylab.show()

View File

@@ -0,0 +1,180 @@
# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Jon Dehdari
# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters,
# Alex Rudnick
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.
Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.
Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""
import re
from nltk.tokenize.api import TokenizerI
class ToktokTokenizer(TokenizerI):
"""
This is a Python port of the tok-tok.pl from
https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl
>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> print(toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
>>> print(toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
>>> assert toktok.tokenize(text, return_str=True) == expected
>>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
True
"""
# Replace non-breaking spaces with normal spaces.
NON_BREAKING = re.compile("\u00A0"), " "
# Pad some funky punctuation.
FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 "
# Pad more funky punctuation.
FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 "
# Pad En dash and em dash
EN_EM_DASHES = re.compile("([–—])"), r" \1 "
# Replace problematic character with numeric character reference.
AMPERCENT = re.compile("& "), "&amp; "
TAB = re.compile("\t"), " &#9; "
PIPE = re.compile(r"\|"), " &#124; "
# Pad numbers with commas to keep them from further tokenization.
COMMA_IN_NUM = re.compile(r"(?<!,)([,،])(?![,\d])"), r" \1 "
# Just pad problematic (often neurotic) hyphen/single quote, etc.
PROB_SINGLE_QUOTES = re.compile(r"(['`])"), r" \1 "
# Group ` ` stupid quotes ' ' into a single token.
STUPID_QUOTES_1 = re.compile(r" ` ` "), r" `` "
STUPID_QUOTES_2 = re.compile(r" ' ' "), r" '' "
# Don't tokenize period unless it ends the line and isn't
# preceded by another period, e.g.
# "something ..." -> "something ..."
# "something." -> "something ."
FINAL_PERIOD_1 = re.compile(r"(?<!\.)\.$"), r" ."
# Don't tokenize period unless it ends the line, e.g.
# " ... stuff." -> "... stuff ."
FINAL_PERIOD_2 = re.compile(r"""(?<!\.)\.\s*(["'’»›”]) *$"""), r" . \1"
# Treat continuous commas as fake German, Czech, etc.: „
MULTI_COMMAS = re.compile(r"(,{2,})"), r" \1 "
# Treat continuous dashes as fake en-dash, etc.
MULTI_DASHES = re.compile(r"(-{2,})"), r" \1 "
# Treat multiple periods as a single token (e.g. an ellipsis)
MULTI_DOTS = re.compile(r"(\.{2,})"), r" \1 "
# This is the \p{Open_Punctuation} from Perl's perluniprops
# see https://perldoc.perl.org/perluniprops.html
OPEN_PUNCT = str(
"([{\u0f3a\u0f3c\u169b\u201a\u201e\u2045\u207d"
"\u208d\u2329\u2768\u276a\u276c\u276e\u2770\u2772"
"\u2774\u27c5\u27e6\u27e8\u27ea\u27ec\u27ee\u2983"
"\u2985\u2987\u2989\u298b\u298d\u298f\u2991\u2993"
"\u2995\u2997\u29d8\u29da\u29fc\u2e22\u2e24\u2e26"
"\u2e28\u3008\u300a\u300c\u300e\u3010\u3014\u3016"
"\u3018\u301a\u301d\ufd3e\ufe17\ufe35\ufe37\ufe39"
"\ufe3b\ufe3d\ufe3f\ufe41\ufe43\ufe47\ufe59\ufe5b"
"\ufe5d\uff08\uff3b\uff5b\uff5f\uff62"
)
# This is the \p{Close_Punctuation} from Perl's perluniprops
CLOSE_PUNCT = str(
")]}\u0f3b\u0f3d\u169c\u2046\u207e\u208e\u232a"
"\u2769\u276b\u276d\u276f\u2771\u2773\u2775\u27c6"
"\u27e7\u27e9\u27eb\u27ed\u27ef\u2984\u2986\u2988"
"\u298a\u298c\u298e\u2990\u2992\u2994\u2996\u2998"
"\u29d9\u29db\u29fd\u2e23\u2e25\u2e27\u2e29\u3009"
"\u300b\u300d\u300f\u3011\u3015\u3017\u3019\u301b"
"\u301e\u301f\ufd3f\ufe18\ufe36\ufe38\ufe3a\ufe3c"
"\ufe3e\ufe40\ufe42\ufe44\ufe48\ufe5a\ufe5c\ufe5e"
"\uff09\uff3d\uff5d\uff60\uff63"
)
# This is the \p{Currency_Symbol} from Perl's perluniprops
CURRENCY_SYM = str(
"$\xa2\xa3\xa4\xa5\u058f\u060b\u09f2\u09f3\u09fb"
"\u0af1\u0bf9\u0e3f\u17db\u20a0\u20a1\u20a2\u20a3"
"\u20a4\u20a5\u20a6\u20a7\u20a8\u20a9\u20aa\u20ab"
"\u20ac\u20ad\u20ae\u20af\u20b0\u20b1\u20b2\u20b3"
"\u20b4\u20b5\u20b6\u20b7\u20b8\u20b9\u20ba\ua838"
"\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6"
)
# Pad spaces after opening punctuations.
OPEN_PUNCT_RE = re.compile(f"([{OPEN_PUNCT}])"), r"\1 "
# Pad spaces before closing punctuations.
CLOSE_PUNCT_RE = re.compile(f"([{CLOSE_PUNCT}])"), r"\1 "
# Pad spaces after currency symbols.
CURRENCY_SYM_RE = re.compile(f"([{CURRENCY_SYM}])"), r"\1 "
# Use for tokenizing URL-unfriendly characters: [:/?#]
URL_FOE_1 = re.compile(r":(?!//)"), r" : " # in perl s{:(?!//)}{ : }g;
URL_FOE_2 = re.compile(r"\?(?!\S)"), r" ? " # in perl s{\?(?!\S)}{ ? }g;
# in perl: m{://} or m{\S+\.\S+/\S+} or s{/}{ / }g;
URL_FOE_3 = re.compile(r"(:\/\/)[\S+\.\S+\/\S+][\/]"), " / "
URL_FOE_4 = re.compile(r" /"), r" / " # s{ /}{ / }g;
# Left/Right strip, i.e. remove heading/trailing spaces.
# These strip regexes should NOT be used,
# instead use str.lstrip(), str.rstrip() or str.strip()
# (They are kept for reference purposes to the original toktok.pl code)
LSTRIP = re.compile(r"^ +"), ""
RSTRIP = re.compile(r"\s+$"), "\n"
# Merge multiple spaces.
ONE_SPACE = re.compile(r" {2,}"), " "
TOKTOK_REGEXES = [
NON_BREAKING,
FUNKY_PUNCT_1,
FUNKY_PUNCT_2,
URL_FOE_1,
URL_FOE_2,
URL_FOE_3,
URL_FOE_4,
AMPERCENT,
TAB,
PIPE,
OPEN_PUNCT_RE,
CLOSE_PUNCT_RE,
MULTI_COMMAS,
COMMA_IN_NUM,
PROB_SINGLE_QUOTES,
STUPID_QUOTES_1,
STUPID_QUOTES_2,
CURRENCY_SYM_RE,
EN_EM_DASHES,
MULTI_DASHES,
MULTI_DOTS,
FINAL_PERIOD_1,
FINAL_PERIOD_2,
ONE_SPACE,
]
def tokenize(self, text, return_str=False):
text = str(text) # Ensure the input is a string.
for regexp, substitution in self.TOKTOK_REGEXES:
text = regexp.sub(substitution, text)
# Finally, strip leading and trailing spaces.
text = str(text.strip())
return text if return_str else text.split()
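# Illustrative sketch (an assumption, not part of the original source): since
# TOKTOK_REGEXES is just a list of (compiled_regex, replacement) pairs, an
# instance can shadow it with extra rules, e.g. to also pad the section sign:
#
#     toktok = ToktokTokenizer()
#     toktok.TOKTOK_REGEXES = toktok.TOKTOK_REGEXES + [
#         (re.compile("(\u00a7)"), r" \1 "),
#     ]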

View File

@@ -0,0 +1,402 @@
# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
# Michael Heilman <mheilman@cmu.edu> (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)
# Tom Aarsen <> (modifications)
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
r"""
Penn Treebank Tokenizer
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
"""
import re
import warnings
from typing import Iterator, List, Tuple
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.destructive import MacIntyreContractions
from nltk.tokenize.util import align_tokens
class TreebankWordTokenizer(TokenizerI):
r"""
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This tokenizer performs the following steps:
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
>>> s = "They'll save and invest more."
>>> TreebankWordTokenizer().tokenize(s)
['They', "'ll", 'save', 'and', 'invest', 'more', '.']
>>> s = "hi, my name can't hello,"
>>> TreebankWordTokenizer().tokenize(s)
['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
"""
# starting quotes
STARTING_QUOTES = [
(re.compile(r"^\""), r"``"),
(re.compile(r"(``)"), r" \1 "),
(re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
]
# punctuation
PUNCTUATION = [
(re.compile(r"([:,])([^\d])"), r" \1 \2"),
(re.compile(r"([:,])$"), r" \1 "),
(re.compile(r"\.\.\."), r" ... "),
(re.compile(r"[;@#$%&]"), r" \g<0> "),
(
re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
r"\1 \2\3 ",
), # Handles the final period.
(re.compile(r"[?!]"), r" \g<0> "),
(re.compile(r"([^'])' "), r"\1 ' "),
]
# Pads parentheses
PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")
# Optionally: convert parentheses and brackets to PTB symbols.
CONVERT_PARENTHESES = [
(re.compile(r"\("), "-LRB-"),
(re.compile(r"\)"), "-RRB-"),
(re.compile(r"\["), "-LSB-"),
(re.compile(r"\]"), "-RSB-"),
(re.compile(r"\{"), "-LCB-"),
(re.compile(r"\}"), "-RCB-"),
]
DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
# ending quotes
ENDING_QUOTES = [
(re.compile(r"''"), " '' "),
(re.compile(r'"'), " '' "),
(re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
(re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
]
# List of contractions adapted from Robert MacIntyre's tokenizer.
_contractions = MacIntyreContractions()
CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
def tokenize(
self, text: str, convert_parentheses: bool = False, return_str: bool = False
) -> List[str]:
r"""Return a tokenized copy of `text`.
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, replace parentheses to PTB symbols,
e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
"""
if return_str is not False:
warnings.warn(
"Parameter 'return_str' has been deprecated and should no "
"longer be used.",
category=DeprecationWarning,
stacklevel=2,
)
for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)
for regexp, substitution in self.PUNCTUATION:
text = regexp.sub(substitution, text)
# Handles parentheses.
regexp, substitution = self.PARENS_BRACKETS
text = regexp.sub(substitution, text)
# Optionally convert parentheses
if convert_parentheses:
for regexp, substitution in self.CONVERT_PARENTHESES:
text = regexp.sub(substitution, text)
# Handles double dash.
regexp, substitution = self.DOUBLE_DASHES
text = regexp.sub(substitution, text)
# add extra space to make things easier
text = " " + text + " "
for regexp, substitution in self.ENDING_QUOTES:
text = regexp.sub(substitution, text)
for regexp in self.CONTRACTIONS2:
text = regexp.sub(r" \1 \2 ", text)
for regexp in self.CONTRACTIONS3:
text = regexp.sub(r" \1 \2 ", text)
# We are not using CONTRACTIONS4 since
# they are also commented out in the SED scripts
# for regexp in self._contractions.CONTRACTIONS4:
# text = regexp.sub(r' \1 \2 \3 ', text)
return text.split()
def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc nltk.tokenize.util.align_tokens to return the offset spans.
>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
>>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
True
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
>>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
True
:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
"""
raw_tokens = self.tokenize(text)
# Convert converted quotes back to original double quotes
# Do this only if original text contains double quote(s) or double
# single-quotes (because '' might be transformed to `` if it is
# treated as starting quotes).
if ('"' in text) or ("''" in text):
# Find double quotes and converted quotes
matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]
# Replace converted quotes back to double quotes
tokens = [
matched.pop(0) if tok in ['"', "``", "''"] else tok
for tok in raw_tokens
]
else:
tokens = raw_tokens
yield from align_tokens(tokens, text)
class TreebankWordDetokenizer(TokenizerI):
r"""
The Treebank detokenizer uses the reverse regex operations corresponding to
the Treebank tokenizer's regexes.
Note:
- There are additional assumptions made when undoing the padding of ``[;@#$%&]``
punctuation symbols that aren't presupposed in the TreebankTokenizer.
- There are additional regexes added in reversing the parentheses tokenization,
such as ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
padding added to the closing parentheses preceding ``[:;,.]``.
- It's not possible to restore the original whitespace as it was, because there
is no explicit record of where `'\n'`, `'\t'` or `'\s'` were removed by
the text.split() operation.
>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.'''
>>> d = TreebankWordDetokenizer()
>>> t = TreebankWordTokenizer()
>>> toks = t.tokenize(s)
>>> d.detokenize(toks)
'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'
The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
parameter:
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
>>> expected_tokens == t.tokenize(s, convert_parentheses=True)
True
>>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
>>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
True
During tokenization it's safe to add more spaces but during detokenization,
simply undoing the padding doesn't really help.
- During tokenization, left and right padding is added to ``[!?]``; when
detokenizing, only a left shift of the ``[!?]`` is needed.
Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.
- During tokenization ``[:,]`` are left and right padded, but when detokenizing,
only a left shift is necessary and we keep the right pad after a comma/colon
if the following string is a non-digit.
Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.
>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
>>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
>>> twd = TreebankWordDetokenizer()
>>> twd.detokenize(toks)
"hello, i can't feel my feet! Help!!"
>>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
>>> twd.detokenize(toks)
"hello, i can't feel; my feet! Help!! He said: Help, help?!"
"""
_contractions = MacIntyreContractions()
CONTRACTIONS2 = [
re.compile(pattern.replace("(?#X)", r"\s"))
for pattern in _contractions.CONTRACTIONS2
]
CONTRACTIONS3 = [
re.compile(pattern.replace("(?#X)", r"\s"))
for pattern in _contractions.CONTRACTIONS3
]
# ending quotes
ENDING_QUOTES = [
(re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "),
(re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "),
(re.compile(r"(\S)\s(\'\')"), r"\1\2"),
(
re.compile(r"(\'\')\s([.,:)\]>};%])"),
r"\1\2",
), # Quotes followed by no-left-padded punctuations.
(re.compile(r"''"), '"'),
]
# Handles double dashes
DOUBLE_DASHES = (re.compile(r" -- "), r"--")
# Optionally: convert parentheses and brackets back from PTB symbols.
CONVERT_PARENTHESES = [
(re.compile("-LRB-"), "("),
(re.compile("-RRB-"), ")"),
(re.compile("-LSB-"), "["),
(re.compile("-RSB-"), "]"),
(re.compile("-LCB-"), "{"),
(re.compile("-RCB-"), "}"),
]
# Undo padding on parentheses.
PARENS_BRACKETS = [
(re.compile(r"([\[\(\{\<])\s"), r"\g<1>"),
(re.compile(r"\s([\]\)\}\>])"), r"\g<1>"),
(re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"),
]
# punctuation
PUNCTUATION = [
(re.compile(r"([^'])\s'\s"), r"\1' "),
(re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!]
# (re.compile(r'\s([?!])\s'), r'\g<1>'),
(re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"),
# When tokenizing, [;@#$%&] are padded with whitespace regardless of
# whether there are spaces before or after them.
# But during detokenization, we need to distinguish between left/right
# pad, so we split this up.
(re.compile(r"([#$])\s"), r"\g<1>"), # Left pad.
(re.compile(r"\s([;%])"), r"\g<1>"), # Right pad.
# (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad.
(re.compile(r"\s\.\.\.\s"), r"..."),
# (re.compile(r"\s([:,])\s$"), r"\1"), # .strip() takes care of it.
(
re.compile(r"\s([:,])"),
r"\1",
), # Just remove left padding. Punctuation in numbers won't be padded.
]
# starting quotes
STARTING_QUOTES = [
(re.compile(r"([ (\[{<])\s``"), r"\1``"),
(re.compile(r"(``)\s"), r"\1"),
(re.compile(r"``"), r'"'),
]
def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""
Treebank detokenizer, implemented by undoing the regexes applied in
``TreebankWordTokenizer.tokenize()``.
:param tokens: A list of strings, i.e. tokenized text.
:type tokens: List[str]
:param convert_parentheses: if True, replace PTB symbols with parentheses,
e.g. `-LRB-` to `(`. Defaults to False.
:type convert_parentheses: bool, optional
:return: The detokenized text.
:rtype: str
"""
text = " ".join(tokens)
# Add extra space to make things easier
text = " " + text + " "
# Reverse the contractions regexes.
# Note: CONTRACTIONS4 are not used in tokenization.
for regexp in self.CONTRACTIONS3:
text = regexp.sub(r"\1\2", text)
for regexp in self.CONTRACTIONS2:
text = regexp.sub(r"\1\2", text)
# Reverse the regexes applied for ending quotes.
for regexp, substitution in self.ENDING_QUOTES:
text = regexp.sub(substitution, text)
# Undo the space padding.
text = text.strip()
# Reverse the padding on double dashes.
regexp, substitution = self.DOUBLE_DASHES
text = regexp.sub(substitution, text)
if convert_parentheses:
for regexp, substitution in self.CONVERT_PARENTHESES:
text = regexp.sub(substitution, text)
# Reverse the padding regexes applied for parenthesis/brackets.
for regexp, substitution in self.PARENS_BRACKETS:
text = regexp.sub(substitution, text)
# Reverse the regexes applied for punctuations.
for regexp, substitution in self.PUNCTUATION:
text = regexp.sub(substitution, text)
# Reverse the regexes applied for starting quotes.
for regexp, substitution in self.STARTING_QUOTES:
text = regexp.sub(substitution, text)
return text.strip()
def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str:
"""Duck-typing the abstract *tokenize()*."""
return self.tokenize(tokens, convert_parentheses)

View File

@@ -0,0 +1,295 @@
# Natural Language Toolkit: Tokenizer Utilities
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
from re import finditer
from xml.sax.saxutils import escape, unescape
def string_span_tokenize(s, sep):
r"""
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
tuples, by splitting the string at each occurrence of *sep*.
>>> from nltk.tokenize.util import string_span_tokenize
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
... two of them.\n\nThanks.'''
>>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE
[(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37),
(38, 44), (45, 48), (49, 55), (56, 58), (59, 73)]
:param s: the string to be tokenized
:type s: str
:param sep: the token separator
:type sep: str
:rtype: iter(tuple(int, int))
"""
if len(sep) == 0:
raise ValueError("Token delimiter must not be empty")
left = 0
while True:
try:
right = s.index(sep, left)
if right != 0:
yield left, right
except ValueError:
if left != len(s):
yield left, len(s)
break
left = right + len(sep)
def regexp_span_tokenize(s, regexp):
r"""
Return the offsets of the tokens in *s*, as a sequence of ``(start, end)``
tuples, by splitting the string at each successive match of *regexp*.
>>> from nltk.tokenize.util import regexp_span_tokenize
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
... two of them.\n\nThanks.'''
>>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE
[(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36),
(38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
:param s: the string to be tokenized
:type s: str
:param regexp: regular expression that matches token separators (must not be empty)
:type regexp: str
:rtype: iter(tuple(int, int))
"""
left = 0
for m in finditer(regexp, s):
right, next = m.span()
if right != left:
yield left, right
left = next
yield left, len(s)
def spans_to_relative(spans):
r"""
Return a sequence of relative spans, given a sequence of spans.
>>> from nltk.tokenize import WhitespaceTokenizer
>>> from nltk.tokenize.util import spans_to_relative
>>> s = '''Good muffins cost $3.88\nin New York. Please buy me
... two of them.\n\nThanks.'''
>>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE
[(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6),
(1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)]
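The absolute offsets can be recovered from the relative spans by keeping a
running end position (a minimal sketch, not part of the NLTK API):
>>> rel = [(0, 4), (1, 7), (1, 4)]
>>> spans, point = [], 0
>>> for gap, length in rel:
...     start = point + gap
...     spans.append((start, start + length))
...     point = start + length
>>> spans
[(0, 4), (5, 12), (13, 17)]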
:param spans: a sequence of (start, end) offsets of the tokens
:type spans: iter(tuple(int, int))
:rtype: iter(tuple(int, int))
"""
prev = 0
for left, right in spans:
yield left - prev, right - left
prev = right
class CJKChars:
"""
An object that enumerates the code points of the CJK characters as listed on
https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
This is a Python port of the CJK code point enumerations of Moses tokenizer:
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309
"""
# Hangul Jamo (1100-11FF)
Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff"))
# CJK Radicals Supplement (2E80-2EFF)
# Kangxi Radicals (2F00-2FDF)
# Ideographic Description Characters (2FF0-2FFF)
# CJK Symbols and Punctuation (3000-303F)
# Hiragana (3040-309F)
# Katakana (30A0-30FF)
# Bopomofo (3100-312F)
# Hangul Compatibility Jamo (3130-318F)
# Kanbun (3190-319F)
# Bopomofo Extended (31A0-31BF)
# CJK Strokes (31C0-31EF)
# Katakana Phonetic Extensions (31F0-31FF)
# Enclosed CJK Letters and Months (3200-32FF)
# CJK Compatibility (3300-33FF)
# CJK Unified Ideographs Extension A (3400-4DBF)
# Yijing Hexagram Symbols (4DC0-4DFF)
# CJK Unified Ideographs (4E00-9FFF)
# Yi Syllables (A000-A48F)
# Yi Radicals (A490-A4CF)
CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf"))
# Phags-pa (A840-A87F)
Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f"))
# Hangul Syllables (AC00-D7AF)
Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF"))
# CJK Compatibility Ideographs (F900-FAFF)
CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF"))
# CJK Compatibility Forms (FE30-FE4F)
CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F"))
# Range U+FF65-FFDC encodes halfwidth forms of Katakana and Hangul characters
Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC"))
# Supplementary Ideographic Plane 20000-2FFFF
Supplementary_Ideographic_Plane = (
131072,
196607,
) # (ord(u"\U00020000"), ord(u"\U0002FFFF"))
ranges = [
Hangul_Jamo,
CJK_Radicals,
Phags_Pa,
Hangul_Syllables,
CJK_Compatibility_Ideographs,
CJK_Compatibility_Forms,
Katakana_Hangul_Halfwidth,
Supplementary_Ideographic_Plane,
]
def is_cjk(character):
"""
Python port of Moses' code to check whether a character is a CJK character.
>>> CJKChars().ranges
[(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)]
>>> is_cjk(u'\u33fe')
True
>>> is_cjk(u'\uFE5F')
False
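A small illustrative use (not part of the original doctests), keeping only
the CJK characters of a mixed string:
>>> ''.join(c for c in u'NLTK 自然言語処理' if is_cjk(c))
'自然言語処理'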
:param character: The character that needs to be checked.
:type character: str
:return: bool
"""
return any(
[
start <= ord(character) <= end
for start, end in [
(4352, 4607),
(11904, 42191),
(43072, 43135),
(44032, 55215),
(63744, 64255),
(65072, 65103),
(65381, 65500),
(131072, 196607),
]
]
)
def xml_escape(text):
"""
This function transforms the input text into an "escaped" version suitable
for well-formed XML formatting.
Note that the default ``xml.sax.saxutils.escape()`` function does not escape
some characters that Moses does, so we have to add them manually to the
``entities`` dictionary.
>>> input_str = ''')| & < > ' " ] ['''
>>> expected_output = ''')| &amp; &lt; &gt; ' " ] ['''
>>> escape(input_str) == expected_output
True
>>> xml_escape(input_str)
')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
:param text: The text that needs to be escaped.
:type text: str
:rtype: str
"""
return escape(
text,
entities={
r"'": r"&apos;",
r'"': r"&quot;",
r"|": r"&#124;",
r"[": r"&#91;",
r"]": r"&#93;",
},
)
def xml_unescape(text):
"""
This function transforms the "escaped" version suitable for well-formed XML
formatting back into a human-readable string.
Note that the default ``xml.sax.saxutils.unescape()`` function does not
unescape some characters that Moses does, so we have to add them manually to
the ``entities`` dictionary.
>>> from xml.sax.saxutils import unescape
>>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
>>> expected = ''')| & < > \' " ] ['''
>>> xml_unescape(s) == expected
True
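Escaping with ``xml_escape`` and then unescaping round-trips these entities
(an illustrative check, not part of the original doctests):
>>> raw = ''')| & < > ' " ] ['''
>>> xml_unescape(xml_escape(raw)) == raw
True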
:param text: The text that needs to be unescaped.
:type text: str
:rtype: str
"""
return unescape(
text,
entities={
r"&apos;": r"'",
r"&quot;": r'"',
r"&#124;": r"|",
r"&#91;": r"[",
r"&#93;": r"]",
},
)
def align_tokens(tokens, sentence):
"""
This function attempts to find the offsets of the tokens in *sentence*, as a
sequence of ``(start, end)`` tuples, given the tokens and the source string.
>>> from nltk.tokenize import TreebankWordTokenizer
>>> from nltk.tokenize.util import align_tokens
>>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
... "on Saturday.")
>>> tokens = TreebankWordTokenizer().tokenize(s)
>>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
... (123, 131), (131, 132)]
>>> output = list(align_tokens(tokens, s))
>>> len(tokens) == len(expected) == len(output) # Check that length of tokens and tuples are the same.
True
>>> expected == list(align_tokens(tokens, s)) # Check that the output is as expected.
True
>>> tokens == [s[start:end] for start, end in output] # Check that the slices of the string corresponds to the tokens.
True
:param tokens: The list of strings that are the result of tokenization
:type tokens: list(str)
:param sentence: The original string
:type sentence: str
:rtype: list(tuple(int,int))
"""
point = 0
offsets = []
for token in tokens:
try:
start = sentence.index(token, point)
except ValueError as e:
raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
point = start + len(token)
offsets.append((start, point))
return offsets