updates
Backend/venv/lib/python3.12/site-packages/nltk/classify/senna.py (new file, 175 lines)
@@ -0,0 +1,175 @@
# Natural Language Toolkit: Senna Interface
#
# Copyright (C) 2001-2025 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A general interface to the SENNA pipeline that supports any of the
operations specified in SUPPORTED_OPERATIONS.

Applying multiple operations at once has a speed advantage. For example,
Senna will automatically determine POS tags if you are extracting named
entities, so applying both operations costs only the time of extracting
the named entities.

The SENNA pipeline has a fixed maximum sentence size that it can read;
by default it is 1024 tokens per sentence. If you have larger sentences,
consider changing the MAX_SENTENCE_SIZE value in SENNA_main.c and rebuilding
your system-specific binary. Otherwise misalignment errors may be introduced.

The input is:

- the path to the directory that contains the SENNA executables. If the path
  is incorrect, Senna will fall back to the directory given in the SENNA
  environment variable
- the list of operations to be performed
- (optionally) the encoding of the input data (default: utf-8)

Note: Unit tests for this module can be found in test/unit/test_senna.py

    >>> from nltk.classify import Senna
    >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])  # doctest: +SKIP
    >>> sent = 'Dusseldorf is an international business center'.split()
    >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)]  # doctest: +SKIP
    [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'),
    ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')]
"""

from os import environ, path, sep
from platform import architecture, system
from subprocess import PIPE, Popen

from nltk.tag.api import TaggerI


class Senna(TaggerI):
    SUPPORTED_OPERATIONS = ["pos", "chk", "ner"]

    def __init__(self, senna_path, operations, encoding="utf-8"):
        self._encoding = encoding
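        # Keep a trailing separator on the normalised path; self._path is later
        # passed verbatim as the -path argument to the senna binary.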
        self._path = path.normpath(senna_path) + sep

        # Verifies the existence of the executable on the self._path first
        # senna_binary_file_1 = self.executable(self._path)
        exe_file_1 = self.executable(self._path)
        if not path.isfile(exe_file_1):
            # Check for the system environment
            if "SENNA" in environ:
                # self._path = path.join(environ['SENNA'],'')
                self._path = path.normpath(environ["SENNA"]) + sep
                exe_file_2 = self.executable(self._path)
                if not path.isfile(exe_file_2):
                    raise LookupError(
                        "Senna executable expected at %s or %s but not found"
                        % (exe_file_1, exe_file_2)
                    )

        self.operations = operations

    def executable(self, base_path):
        """
        Determine the system-specific binary that should be used in the
        pipeline. If the system is not known, the default "senna" binary is
        used.
        """
        os_name = system()
        if os_name == "Linux":
            bits = architecture()[0]
            if bits == "64bit":
                return path.join(base_path, "senna-linux64")
            return path.join(base_path, "senna-linux32")
        if os_name == "Windows":
            return path.join(base_path, "senna-win32.exe")
        if os_name == "Darwin":
            return path.join(base_path, "senna-osx")
        return path.join(base_path, "senna")

    def _map(self):
        """
        Calculate the order of the columns into which the SENNA pipeline will
        output the tags. This depends on which of the (ordered)
        SUPPORTED_OPERATIONS were requested.
        """
        _map = {}
        i = 1
        for operation in Senna.SUPPORTED_OPERATIONS:
            if operation in self.operations:
                _map[operation] = i
                i += 1
        return _map

    def tag(self, tokens):
        """
        Applies the specified operation(s) on a list of tokens.
        """
        return self.tag_sents([tokens])[0]

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. For every sentence this
        method returns a list of dictionaries; every dictionary contains a word
        with its calculated annotations/tags.
        """
        encoding = self._encoding

        if not path.isfile(self.executable(self._path)):
            raise LookupError(
                "Senna executable expected at %s but not found"
                % self.executable(self._path)
            )

        # Build the senna command to run the tagger
        _senna_cmd = [
            self.executable(self._path),
            "-path",
            self._path,
            "-usrtokens",
            "-iobtags",
        ]
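        # Each requested operation becomes a flag, e.g. ['pos', 'chk', 'ner']
        # adds "-pos -chk -ner" to the command line.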
        _senna_cmd.extend(["-" + op for op in self.operations])

        # Serialize the actual sentences to a temporary string
        _input = "\n".join(" ".join(x) for x in sentences) + "\n"
        if isinstance(_input, str) and encoding:
            _input = _input.encode(encoding)

        # Run the tagger and get the output
        p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        (stdout, stderr) = p.communicate(input=_input)
        senna_output = stdout

        # Check the return code.
        if p.returncode != 0:
            raise RuntimeError("Senna command failed! Details: %s" % stderr)

        if encoding:
            senna_output = stdout.decode(encoding)

        # Output the tagged sentences
        map_ = self._map()
        tagged_sentences = [[]]
        sentence_index = 0
        token_index = 0
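        # SENNA emits one token per line with tab-separated columns and a blank
        # line between sentences; realign each output line with the input tokens.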
        for tagged_word in senna_output.strip().split("\n"):
            if not tagged_word:
                tagged_sentences.append([])
                sentence_index += 1
                token_index = 0
                continue
            tags = tagged_word.split("\t")
            result = {}
            for tag in map_:
                result[tag] = tags[map_[tag]].strip()
            try:
                result["word"] = sentences[sentence_index][token_index]
            except IndexError as e:
                raise IndexError(
                    "Misalignment error occurred at sentence number %d. Possible reason"
                    " is that the sentence size exceeded the maximum size. Check the "
                    "documentation of Senna class for more information."
                    % sentence_index
                ) from e
            tagged_sentences[-1].append(result)
            token_index += 1
        return tagged_sentences