# Natural Language Toolkit: Simple Tokenizers
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

r"""
|
||||
Simple Tokenizers
|
||||
|
||||
These tokenizers divide strings into substrings using the string
|
||||
``split()`` method.
|
||||
When tokenizing using a particular delimiter string, use
|
||||
the string ``split()`` method directly, as this is more efficient.
|
||||
|
||||
The simple tokenizers are *not* available as separate functions;
|
||||
instead, you should just use the string ``split()`` method directly:
|
||||
|
||||
>>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
|
||||
>>> s.split() # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
|
||||
>>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
|
||||
'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
|
||||
>>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
|
||||
['Good muffins cost $3.88', 'in New York. Please buy me',
|
||||
'two of them.', '', 'Thanks.']
|
||||
|
||||
The simple tokenizers are mainly useful because they follow the
|
||||
standard ``TokenizerI`` interface, and so can be used with any code
|
||||
that expects a tokenizer. For example, these tokenizers can be used
|
||||
to specify the tokenization conventions when building a `CorpusReader`.
|
||||
|
||||
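
A sketch of plugging a simple tokenizer into a corpus reader (the corpus
directory and file pattern below are placeholders; ``word_tokenizer`` is the
``PlaintextCorpusReader`` parameter that accepts a tokenizer):

    >>> from nltk.corpus.reader import PlaintextCorpusReader
    >>> from nltk.tokenize import SpaceTokenizer
    >>> reader = PlaintextCorpusReader("corpus_dir", r".*\.txt", word_tokenizer=SpaceTokenizer()) # doctest: +SKIP
    >>> reader.words() # doctest: +SKIP
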
"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    _string = " "


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters. If this functionality
    is ever required directly, use ``for char in string``.
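
    An illustrative doctest (the expected output is read directly off the
    ``tokenize`` and ``span_tokenize`` implementations below; the import is
    from ``nltk.tokenize.simple`` since this class may not be re-exported by
    ``nltk.tokenize``):

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize("abc")
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize("abc"))
        [(0, 1), (1, 2), (2, 3)]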
"""
|
||||
|
||||
_string = None
|
||||
|
||||
def tokenize(self, s):
|
||||
return list(s)
|
||||
|
||||
def span_tokenize(self, s):
|
||||
yield from enumerate(range(1, len(s) + 1))
|
||||
|
||||
|
||||
class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled. Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline (see the example below).
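
    A sketch of the ``discard-eof`` behaviour as implemented by ``tokenize()``
    below: only a trailing blank line is dropped, since ``str.splitlines``
    already swallows a single final newline (hence the doubled newline in the
    example):

        >>> LineTokenizer(blanklines='discard-eof').tokenize('one\ntwo\n\n')
        ['one', 'two']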
"""
|
||||
|
||||
def __init__(self, blanklines="discard"):
|
||||
valid_blanklines = ("discard", "keep", "discard-eof")
|
||||
if blanklines not in valid_blanklines:
|
||||
raise ValueError(
|
||||
"Blank lines must be one of: %s" % " ".join(valid_blanklines)
|
||||
)
|
||||
|
||||
self._blanklines = blanklines
|
||||
|
||||
    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    # discard-eof not implemented here; it falls through to the 'discard' branch
    def span_tokenize(self, s):
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


######################################################################
# { Tokenization Functions
######################################################################
# XXX: it is stated in module docs that there are no function versions


def line_tokenize(text, blanklines="discard"):
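    r"""Tokenize *text* into its lines using ``LineTokenizer``.

    A small usage sketch (the output follows from the splitlines-based
    ``LineTokenizer.tokenize`` above; the default ``blanklines='discard'``
    drops the blank line):

        >>> from nltk.tokenize.simple import line_tokenize
        >>> line_tokenize("one\ntwo\n\nthree\n")
        ['one', 'two', 'three']
    """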
    return LineTokenizer(blanklines).tokenize(text)