updates
This commit is contained in:
@@ -0,0 +1,18 @@
|
||||
# Natural Language Toolkit: Unit Tests
|
||||
#
|
||||
# Copyright (C) 2001-2025 NLTK Project
|
||||
# Author: Edward Loper <edloper@gmail.com>
|
||||
# URL: <https://www.nltk.org/>
|
||||
# For license information, see LICENSE.TXT
|
||||
|
||||
"""
|
||||
Unit tests for the NLTK modules. These tests are intended to ensure
|
||||
that source code changes don't accidentally introduce bugs.
|
||||
For instructions, please see:
|
||||
|
||||
../../web/dev/local_testing.rst
|
||||
|
||||
https://github.com/nltk/nltk/blob/develop/web/dev/local_testing.rst
|
||||
|
||||
|
||||
"""
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
26
Backend/venv/lib/python3.12/site-packages/nltk/test/all.py
Normal file
26
Backend/venv/lib/python3.12/site-packages/nltk/test/all.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""Test suite that runs all NLTK tests.
|
||||
|
||||
This module, `nltk.test.all`, is named as the NLTK ``test_suite`` in the
|
||||
project's ``setup-eggs.py`` file. Here, we create a test suite that
|
||||
runs all of our doctests, and return it for processing by the setuptools
|
||||
test harness.
|
||||
|
||||
"""
|
||||
|
||||
import doctest
|
||||
import os.path
|
||||
import unittest
|
||||
from glob import glob
|
||||
|
||||
|
||||
def additional_tests():
    """Build a test suite from every ``*.doctest`` file next to this module.

    The directory containing this module is scanned for files matching
    ``*.doctest``; each match is wrapped in a ``doctest.DocFileSuite`` and
    the results are collected into a single suite.

    Returns:
        unittest.TestSuite: one ``DocFileSuite`` per doctest file, ordered
        by filename. The suite is empty when no doctest files are found.
    """
    test_dir = os.path.dirname(__file__)
    # glob() yields matches in arbitrary (filesystem-dependent) order; sort
    # so the resulting suite runs in the same order on every machine.
    doctest_paths = sorted(glob(os.path.join(test_dir, "*.doctest")))
    # Only the bare filename is handed to DocFileSuite, which (with its
    # default module_relative=True) resolves it against the calling module's
    # directory -- i.e. the same directory that was just globbed.
    doctest_names = [os.path.basename(path) for path in doctest_paths]
    return unittest.TestSuite(
        [doctest.DocFileSuite(name) for name in doctest_names]
    )
|
||||
|
||||
|
||||
# if os.path.split(path)[-1] != 'index.rst'
|
||||
# skips time-dependent doctest in index.rst
|
||||
@@ -0,0 +1,29 @@
|
||||
==========
|
||||
BLEU tests
|
||||
==========
|
||||
|
||||
>>> from nltk.translate import bleu
|
||||
|
||||
If the candidate has no alignment to any of the references, the BLEU score is 0.
|
||||
|
||||
>>> bleu(
|
||||
... ['The candidate has no alignment to any of the references'.split()],
|
||||
... 'John loves Mary'.split(),
|
||||
... (1,),
|
||||
... )
|
||||
0
|
||||
|
||||
This is an implementation of the smoothing techniques
|
||||
for segment-level BLEU scores that were presented in
|
||||
Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
|
||||
Smoothing Techniques for Sentence-Level BLEU. In WMT14.
|
||||
http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf
|
||||
>>> from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction
|
||||
|
||||
|
||||
>>> sentence_bleu(
|
||||
... ['It is a place of quiet contemplation .'.split()],
|
||||
... 'It is .'.split(),
|
||||
... smoothing_function=SmoothingFunction().method4,
|
||||
... )*100
|
||||
4.4267...
|
||||
@@ -0,0 +1,60 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
>>> import os.path
|
||||
|
||||
>>> from nltk.corpus.reader import BNCCorpusReader
|
||||
>>> import nltk.test
|
||||
|
||||
>>> root = os.path.dirname(nltk.test.__file__)
|
||||
>>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml')
|
||||
|
||||
Checking the word access.
|
||||
-------------------------
|
||||
|
||||
>>> len(bnc.words())
|
||||
151
|
||||
|
||||
>>> bnc.words()[:6]
|
||||
['Ah', 'there', 'we', 'are', ',', '.']
|
||||
>>> bnc.words(stem=True)[:6]
|
||||
['ah', 'there', 'we', 'be', ',', '.']
|
||||
|
||||
>>> bnc.tagged_words()[:6]
|
||||
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
|
||||
|
||||
>>> bnc.tagged_words(c5=True)[:6]
|
||||
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
|
||||
|
||||
Testing access to the sentences.
|
||||
--------------------------------
|
||||
|
||||
>>> len(bnc.sents())
|
||||
15
|
||||
|
||||
>>> bnc.sents()[0]
|
||||
['Ah', 'there', 'we', 'are', ',', '.']
|
||||
>>> bnc.sents(stem=True)[0]
|
||||
['ah', 'there', 'we', 'be', ',', '.']
|
||||
|
||||
>>> bnc.tagged_sents()[0]
|
||||
[('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')]
|
||||
>>> bnc.tagged_sents(c5=True)[0]
|
||||
[('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')]
|
||||
|
||||
A non-lazy loader.
|
||||
------------------
|
||||
|
||||
>>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False)
|
||||
|
||||
>>> len(eager.words())
|
||||
151
|
||||
>>> eager.words(stem=True)[6:17]
|
||||
['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.']
|
||||
|
||||
>>> eager.tagged_words()[6:11]
|
||||
[('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')]
|
||||
>>> eager.tagged_words(c5=True)[6:17]
|
||||
[('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')]
|
||||
>>> len(eager.sents())
|
||||
15
|
||||
376
Backend/venv/lib/python3.12/site-packages/nltk/test/ccg.doctest
Normal file
376
Backend/venv/lib/python3.12/site-packages/nltk/test/ccg.doctest
Normal file
@@ -0,0 +1,376 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==============================
|
||||
Combinatory Categorial Grammar
|
||||
==============================
|
||||
|
||||
Relative Clauses
|
||||
----------------
|
||||
|
||||
>>> from nltk.ccg import chart, lexicon
|
||||
|
||||
Construct a lexicon:
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, NP, N, VP
|
||||
...
|
||||
... Det :: NP/N
|
||||
... Pro :: NP
|
||||
... Modal :: S\\NP/VP
|
||||
...
|
||||
... TV :: VP/NP
|
||||
... DTV :: TV/NP
|
||||
...
|
||||
... the => Det
|
||||
...
|
||||
... that => Det
|
||||
... that => NP
|
||||
...
|
||||
... I => Pro
|
||||
... you => Pro
|
||||
... we => Pro
|
||||
...
|
||||
... chef => N
|
||||
... cake => N
|
||||
... children => N
|
||||
... dough => N
|
||||
...
|
||||
... will => Modal
|
||||
... should => Modal
|
||||
... might => Modal
|
||||
... must => Modal
|
||||
...
|
||||
... and => var\\.,var/.,var
|
||||
...
|
||||
... to => VP[to]/VP
|
||||
...
|
||||
... without => (VP\\VP)/VP[ing]
|
||||
...
|
||||
... be => TV
|
||||
... cook => TV
|
||||
... eat => TV
|
||||
...
|
||||
... cooking => VP[ing]/NP
|
||||
...
|
||||
... give => DTV
|
||||
...
|
||||
... is => (S\\NP)/NP
|
||||
... prefer => (S\\NP)/NP
|
||||
...
|
||||
... which => (N\\N)/(S/NP)
|
||||
...
|
||||
... persuade => (VP/VP[to])/NP
|
||||
... ''')
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> for parse in parser.parse("you prefer that cake".split()):
|
||||
... chart.printCCGDerivation(parse)
|
||||
... break
|
||||
...
|
||||
you prefer that cake
|
||||
NP ((S\NP)/NP) (NP/N) N
|
||||
-------------->
|
||||
NP
|
||||
--------------------------->
|
||||
(S\NP)
|
||||
--------------------------------<
|
||||
S
|
||||
|
||||
>>> for parse in parser.parse("that is the cake which you prefer".split()):
|
||||
... chart.printCCGDerivation(parse)
|
||||
... break
|
||||
...
|
||||
that is the cake which you prefer
|
||||
NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/NP)
|
||||
----->T
|
||||
(S/(S\NP))
|
||||
------------------>B
|
||||
(S/NP)
|
||||
---------------------------------->
|
||||
(N\N)
|
||||
----------------------------------------<
|
||||
N
|
||||
------------------------------------------------>
|
||||
NP
|
||||
------------------------------------------------------------->
|
||||
(S\NP)
|
||||
-------------------------------------------------------------------<
|
||||
S
|
||||
|
||||
|
||||
Some other sentences to try:
|
||||
"that is the cake which we will persuade the chef to cook"
|
||||
"that is the cake which we will persuade the chef to give the children"
|
||||
|
||||
>>> sent = "that is the dough which you will eat without cooking".split()
|
||||
>>> nosub_parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet +
|
||||
... chart.CompositionRuleSet + chart.TypeRaiseRuleSet)
|
||||
|
||||
Without Substitution (no output)
|
||||
|
||||
>>> for parse in nosub_parser.parse(sent):
|
||||
... chart.printCCGDerivation(parse)
|
||||
|
||||
With Substitution:
|
||||
|
||||
>>> for parse in parser.parse(sent):
|
||||
... chart.printCCGDerivation(parse)
|
||||
... break
|
||||
...
|
||||
that is the dough which you will eat without cooking
|
||||
NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/VP) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
|
||||
----->T
|
||||
(S/(S\NP))
|
||||
------------------------------------->B
|
||||
((VP\VP)/NP)
|
||||
----------------------------------------------<Sx
|
||||
(VP/NP)
|
||||
----------------------------------------------------------->B
|
||||
((S\NP)/NP)
|
||||
---------------------------------------------------------------->B
|
||||
(S/NP)
|
||||
-------------------------------------------------------------------------------->
|
||||
(N\N)
|
||||
---------------------------------------------------------------------------------------<
|
||||
N
|
||||
----------------------------------------------------------------------------------------------->
|
||||
NP
|
||||
------------------------------------------------------------------------------------------------------------>
|
||||
(S\NP)
|
||||
------------------------------------------------------------------------------------------------------------------<
|
||||
S
|
||||
|
||||
|
||||
Conjunction
|
||||
-----------
|
||||
|
||||
>>> from nltk.ccg.chart import CCGChartParser, ApplicationRuleSet, CompositionRuleSet
|
||||
>>> from nltk.ccg.chart import SubstitutionRuleSet, TypeRaiseRuleSet, printCCGDerivation
|
||||
>>> from nltk.ccg import lexicon
|
||||
|
||||
Lexicons for the tests:
|
||||
|
||||
>>> test1_lex = '''
|
||||
... :- S,N,NP,VP
|
||||
... I => NP
|
||||
... you => NP
|
||||
... will => S\\NP/VP
|
||||
... cook => VP/NP
|
||||
... which => (N\\N)/(S/NP)
|
||||
... and => var\\.,var/.,var
|
||||
... might => S\\NP/VP
|
||||
... eat => VP/NP
|
||||
... the => NP/N
|
||||
... mushrooms => N
|
||||
... parsnips => N'''
|
||||
>>> test2_lex = '''
|
||||
... :- N, S, NP, VP
|
||||
... articles => N
|
||||
... the => NP/N
|
||||
... and => var\\.,var/.,var
|
||||
... which => (N\\N)/(S/NP)
|
||||
... I => NP
|
||||
... anyone => NP
|
||||
... will => (S/VP)\\NP
|
||||
... file => VP/NP
|
||||
... without => (VP\\VP)/VP[ing]
|
||||
... forget => VP/NP
|
||||
... reading => VP[ing]/NP
|
||||
... '''
|
||||
|
||||
Tests handling of conjunctions.
|
||||
Note that while the two derivations are different, they are semantically equivalent.
|
||||
|
||||
>>> lex = lexicon.fromstring(test1_lex)
|
||||
>>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet)
|
||||
>>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()):
|
||||
... printCCGDerivation(parse)
|
||||
I will cook and might eat the mushrooms and parsnips
|
||||
NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N
|
||||
---------------------->B
|
||||
((S\NP)/NP)
|
||||
---------------------->B
|
||||
((S\NP)/NP)
|
||||
------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP))
|
||||
-----------------------------------------------------------------------<
|
||||
((S\NP)/NP)
|
||||
------------------------------------->
|
||||
(N\.,N)
|
||||
------------------------------------------------<
|
||||
N
|
||||
-------------------------------------------------------->
|
||||
NP
|
||||
------------------------------------------------------------------------------------------------------------------------------->
|
||||
(S\NP)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------<
|
||||
S
|
||||
I will cook and might eat the mushrooms and parsnips
|
||||
NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N
|
||||
---------------------->B
|
||||
((S\NP)/NP)
|
||||
---------------------->B
|
||||
((S\NP)/NP)
|
||||
------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP))
|
||||
-----------------------------------------------------------------------<
|
||||
((S\NP)/NP)
|
||||
------------------------------------------------------------------------------->B
|
||||
((S\NP)/N)
|
||||
------------------------------------->
|
||||
(N\.,N)
|
||||
------------------------------------------------<
|
||||
N
|
||||
------------------------------------------------------------------------------------------------------------------------------->
|
||||
(S\NP)
|
||||
-----------------------------------------------------------------------------------------------------------------------------------<
|
||||
S
|
||||
|
||||
|
||||
Tests handling subject extraction.
|
||||
It is interesting to point out that the two parses are clearly semantically different.
|
||||
|
||||
>>> lex = lexicon.fromstring(test2_lex)
|
||||
>>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet)
|
||||
>>> for parse in parser.parse("articles which I will file and forget without reading".split()):
|
||||
... printCCGDerivation(parse)
|
||||
articles which I will file and forget without reading
|
||||
N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
|
||||
-----------------<
|
||||
(S/VP)
|
||||
------------------------------------->B
|
||||
((VP\VP)/NP)
|
||||
----------------------------------------------<Sx
|
||||
(VP/NP)
|
||||
------------------------------------------------------------------------->
|
||||
((VP/NP)\.,(VP/NP))
|
||||
----------------------------------------------------------------------------------<
|
||||
(VP/NP)
|
||||
--------------------------------------------------------------------------------------------------->B
|
||||
(S/NP)
|
||||
------------------------------------------------------------------------------------------------------------------->
|
||||
(N\N)
|
||||
-----------------------------------------------------------------------------------------------------------------------------<
|
||||
N
|
||||
articles which I will file and forget without reading
|
||||
N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP)
|
||||
-----------------<
|
||||
(S/VP)
|
||||
------------------------------------>
|
||||
((VP/NP)\.,(VP/NP))
|
||||
---------------------------------------------<
|
||||
(VP/NP)
|
||||
------------------------------------->B
|
||||
((VP\VP)/NP)
|
||||
----------------------------------------------------------------------------------<Sx
|
||||
(VP/NP)
|
||||
--------------------------------------------------------------------------------------------------->B
|
||||
(S/NP)
|
||||
------------------------------------------------------------------------------------------------------------------->
|
||||
(N\N)
|
||||
-----------------------------------------------------------------------------------------------------------------------------<
|
||||
N
|
||||
|
||||
|
||||
Unicode support
|
||||
---------------
|
||||
|
||||
Unicode words are supported.
|
||||
|
||||
>>> from nltk.ccg import chart, lexicon
|
||||
|
||||
Lexicons for the tests:
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, N, NP, PP
|
||||
...
|
||||
... AdjI :: N\\N
|
||||
... AdjD :: N/N
|
||||
... AdvD :: S/S
|
||||
... AdvI :: S\\S
|
||||
... Det :: NP/N
|
||||
... PrepNPCompl :: PP/NP
|
||||
... PrepNAdjN :: S\\S/N
|
||||
... PrepNAdjNP :: S\\S/NP
|
||||
... VPNP :: S\\NP/NP
|
||||
... VPPP :: S\\NP/PP
|
||||
... VPser :: S\\NP/AdjI
|
||||
...
|
||||
... auto => N
|
||||
... bebidas => N
|
||||
... cine => N
|
||||
... ley => N
|
||||
... libro => N
|
||||
... ministro => N
|
||||
... panadería => N
|
||||
... presidente => N
|
||||
... super => N
|
||||
...
|
||||
... el => Det
|
||||
... la => Det
|
||||
... las => Det
|
||||
... un => Det
|
||||
...
|
||||
... Ana => NP
|
||||
... Pablo => NP
|
||||
...
|
||||
... y => var\\.,var/.,var
|
||||
...
|
||||
... pero => (S/NP)\\(S/NP)/(S/NP)
|
||||
...
|
||||
... anunció => VPNP
|
||||
... compró => VPNP
|
||||
... cree => S\\NP/S[dep]
|
||||
... desmintió => VPNP
|
||||
... lee => VPNP
|
||||
... fueron => VPPP
|
||||
...
|
||||
... es => VPser
|
||||
...
|
||||
... interesante => AdjD
|
||||
... interesante => AdjI
|
||||
... nueva => AdjD
|
||||
... nueva => AdjI
|
||||
...
|
||||
... a => PrepNPCompl
|
||||
... en => PrepNAdjN
|
||||
... en => PrepNAdjNP
|
||||
...
|
||||
... ayer => AdvI
|
||||
...
|
||||
... que => (NP\\NP)/(S/NP)
|
||||
... que => S[dep]/S
|
||||
... ''')
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()):
|
||||
... printCCGDerivation(parse) # doctest: +SKIP
|
||||
... # it fails on python2.7 because of the unicode problem explained in https://github.com/nltk/nltk/pull/1354
|
||||
... break
|
||||
el ministro anunció pero el presidente desmintió la nueva ley
|
||||
(NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N
|
||||
------------------>
|
||||
NP
|
||||
------------------>T
|
||||
(S/(S\NP))
|
||||
-------------------->
|
||||
NP
|
||||
-------------------->T
|
||||
(S/(S\NP))
|
||||
--------------------------------->B
|
||||
(S/NP)
|
||||
----------------------------------------------------------->
|
||||
((S/NP)\(S/NP))
|
||||
------------>
|
||||
N
|
||||
-------------------->
|
||||
NP
|
||||
--------------------<T
|
||||
(S\(S/NP))
|
||||
-------------------------------------------------------------------------------<B
|
||||
(S\(S/NP))
|
||||
--------------------------------------------------------------------------------------------<B
|
||||
(S/NP)
|
||||
-------------------------------------------------------------------------------------------------------------->
|
||||
S
|
||||
@@ -0,0 +1,552 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==============================================
|
||||
Combinatory Categorial Grammar with semantics
|
||||
==============================================
|
||||
|
||||
-----
|
||||
Chart
|
||||
-----
|
||||
|
||||
|
||||
>>> from nltk.ccg import chart, lexicon
|
||||
>>> from nltk.ccg.chart import printCCGDerivation
|
||||
|
||||
No semantics
|
||||
-------------------
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, NP, N
|
||||
... She => NP
|
||||
... has => (S\\NP)/NP
|
||||
... books => NP
|
||||
... ''',
|
||||
... False)
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> parses = list(parser.parse("She has books".split()))
|
||||
>>> print(str(len(parses)) + " parses")
|
||||
3 parses
|
||||
|
||||
>>> printCCGDerivation(parses[0])
|
||||
She has books
|
||||
NP ((S\NP)/NP) NP
|
||||
-------------------->
|
||||
(S\NP)
|
||||
-------------------------<
|
||||
S
|
||||
|
||||
>>> printCCGDerivation(parses[1])
|
||||
She has books
|
||||
NP ((S\NP)/NP) NP
|
||||
----->T
|
||||
(S/(S\NP))
|
||||
-------------------->
|
||||
(S\NP)
|
||||
------------------------->
|
||||
S
|
||||
|
||||
|
||||
>>> printCCGDerivation(parses[2])
|
||||
She has books
|
||||
NP ((S\NP)/NP) NP
|
||||
----->T
|
||||
(S/(S\NP))
|
||||
------------------>B
|
||||
(S/NP)
|
||||
------------------------->
|
||||
S
|
||||
|
||||
Simple semantics
|
||||
-------------------
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, NP, N
|
||||
... She => NP {she}
|
||||
... has => (S\\NP)/NP {\\x y.have(y, x)}
|
||||
... a => NP/N {\\P.exists z.P(z)}
|
||||
... book => N {book}
|
||||
... ''',
|
||||
... True)
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> parses = list(parser.parse("She has a book".split()))
|
||||
>>> print(str(len(parses)) + " parses")
|
||||
7 parses
|
||||
|
||||
>>> printCCGDerivation(parses[0])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
------------------------------------->
|
||||
NP {exists z.book(z)}
|
||||
------------------------------------------------------------------->
|
||||
(S\NP) {\y.have(y,exists z.book(z))}
|
||||
-----------------------------------------------------------------------------<
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
>>> printCCGDerivation(parses[1])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
--------------------------------------------------------->B
|
||||
((S\NP)/N) {\P y.have(y,exists z.P(z))}
|
||||
------------------------------------------------------------------->
|
||||
(S\NP) {\y.have(y,exists z.book(z))}
|
||||
-----------------------------------------------------------------------------<
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
>>> printCCGDerivation(parses[2])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
---------->T
|
||||
(S/(S\NP)) {\F.F(she)}
|
||||
------------------------------------->
|
||||
NP {exists z.book(z)}
|
||||
------------------------------------------------------------------->
|
||||
(S\NP) {\y.have(y,exists z.book(z))}
|
||||
----------------------------------------------------------------------------->
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
>>> printCCGDerivation(parses[3])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
---------->T
|
||||
(S/(S\NP)) {\F.F(she)}
|
||||
--------------------------------------------------------->B
|
||||
((S\NP)/N) {\P y.have(y,exists z.P(z))}
|
||||
------------------------------------------------------------------->
|
||||
(S\NP) {\y.have(y,exists z.book(z))}
|
||||
----------------------------------------------------------------------------->
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
>>> printCCGDerivation(parses[4])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
---------->T
|
||||
(S/(S\NP)) {\F.F(she)}
|
||||
---------------------------------------->B
|
||||
(S/NP) {\x.have(she,x)}
|
||||
------------------------------------->
|
||||
NP {exists z.book(z)}
|
||||
----------------------------------------------------------------------------->
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
>>> printCCGDerivation(parses[5])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
---------->T
|
||||
(S/(S\NP)) {\F.F(she)}
|
||||
--------------------------------------------------------->B
|
||||
((S\NP)/N) {\P y.have(y,exists z.P(z))}
|
||||
------------------------------------------------------------------->B
|
||||
(S/N) {\P.have(she,exists z.P(z))}
|
||||
----------------------------------------------------------------------------->
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
>>> printCCGDerivation(parses[6])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book}
|
||||
---------->T
|
||||
(S/(S\NP)) {\F.F(she)}
|
||||
---------------------------------------->B
|
||||
(S/NP) {\x.have(she,x)}
|
||||
------------------------------------------------------------------->B
|
||||
(S/N) {\P.have(she,exists z.P(z))}
|
||||
----------------------------------------------------------------------------->
|
||||
S {have(she,exists z.book(z))}
|
||||
|
||||
Complex semantics
|
||||
-------------------
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, NP, N
|
||||
... She => NP {she}
|
||||
... has => (S\\NP)/NP {\\x y.have(y, x)}
|
||||
... a => ((S\\NP)\\((S\\NP)/NP))/N {\\P R x.(exists z.P(z) & R(z,x))}
|
||||
... book => N {book}
|
||||
... ''',
|
||||
... True)
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> parses = list(parser.parse("She has a book".split()))
|
||||
>>> print(str(len(parses)) + " parses")
|
||||
2 parses
|
||||
|
||||
>>> printCCGDerivation(parses[0])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book}
|
||||
---------------------------------------------------------------------->
|
||||
((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))}
|
||||
----------------------------------------------------------------------------------------------------<
|
||||
(S\NP) {\x.(exists z.book(z) & have(x,z))}
|
||||
--------------------------------------------------------------------------------------------------------------<
|
||||
S {(exists z.book(z) & have(she,z))}
|
||||
|
||||
>>> printCCGDerivation(parses[1])
|
||||
She has a book
|
||||
NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book}
|
||||
---------->T
|
||||
(S/(S\NP)) {\F.F(she)}
|
||||
---------------------------------------------------------------------->
|
||||
((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))}
|
||||
----------------------------------------------------------------------------------------------------<
|
||||
(S\NP) {\x.(exists z.book(z) & have(x,z))}
|
||||
-------------------------------------------------------------------------------------------------------------->
|
||||
S {(exists z.book(z) & have(she,z))}
|
||||
|
||||
Using conjunctions
|
||||
---------------------
|
||||
|
||||
# TODO: The semantics of "and" should have been more flexible
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, NP, N
|
||||
... I => NP {I}
|
||||
... cook => (S\\NP)/NP {\\x y.cook(x,y)}
|
||||
... and => var\\.,var/.,var {\\P Q x y.(P(x,y) & Q(x,y))}
|
||||
... eat => (S\\NP)/NP {\\x y.eat(x,y)}
|
||||
... the => NP/N {\\x.the(x)}
|
||||
... bacon => N {bacon}
|
||||
... ''',
|
||||
... True)
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> parses = list(parser.parse("I cook and eat the bacon".split()))
|
||||
>>> print(str(len(parses)) + " parses")
|
||||
7 parses
|
||||
|
||||
>>> printCCGDerivation(parses[0])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
------------------------------->
|
||||
NP {the(bacon)}
|
||||
-------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
(S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
|
||||
----------------------------------------------------------------------------------------------------------------------------------------------------------<
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
>>> printCCGDerivation(parses[1])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
--------------------------------------------------------------------------------------------------------------------------------------->B
|
||||
((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))}
|
||||
-------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
(S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
|
||||
----------------------------------------------------------------------------------------------------------------------------------------------------------<
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
>>> printCCGDerivation(parses[2])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
------------------------------->
|
||||
NP {the(bacon)}
|
||||
-------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
(S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
>>> printCCGDerivation(parses[3])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
--------------------------------------------------------------------------------------------------------------------------------------->B
|
||||
((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))}
|
||||
-------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
(S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))}
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
>>> printCCGDerivation(parses[4])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
--------------------------------------------------------------------------------------------------------------------------->B
|
||||
(S/NP) {\x.(eat(x,I) & cook(x,I))}
|
||||
------------------------------->
|
||||
NP {the(bacon)}
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
>>> printCCGDerivation(parses[5])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
--------------------------------------------------------------------------------------------------------------------------------------->B
|
||||
((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))}
|
||||
----------------------------------------------------------------------------------------------------------------------------------------------->B
|
||||
(S/N) {\x.(eat(the(x),I) & cook(the(x),I))}
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
>>> printCCGDerivation(parses[6])
|
||||
I cook and eat the bacon
|
||||
NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
------------------------------------------------------------------------------------->
|
||||
(((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))}
|
||||
-------------------------------------------------------------------------------------------------------------------<
|
||||
((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))}
|
||||
--------------------------------------------------------------------------------------------------------------------------->B
|
||||
(S/NP) {\x.(eat(x,I) & cook(x,I))}
|
||||
----------------------------------------------------------------------------------------------------------------------------------------------->B
|
||||
(S/N) {\x.(eat(the(x),I) & cook(the(x),I))}
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------------------->
|
||||
S {(eat(the(bacon),I) & cook(the(bacon),I))}
|
||||
|
||||
Tests from published papers
|
||||
------------------------------
|
||||
|
||||
An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- S, NP
|
||||
... I => NP {I}
|
||||
... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)}
|
||||
... them => NP {them}
|
||||
... money => NP {money}
|
||||
... ''',
|
||||
... True)
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> parses = list(parser.parse("I give them money".split()))
|
||||
>>> print(str(len(parses)) + " parses")
|
||||
3 parses
|
||||
|
||||
>>> printCCGDerivation(parses[0])
|
||||
I give them money
|
||||
NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money}
|
||||
-------------------------------------------------->
|
||||
((S\NP)/NP) {\y z.give(y,them,z)}
|
||||
-------------------------------------------------------------->
|
||||
(S\NP) {\z.give(money,them,z)}
|
||||
----------------------------------------------------------------------<
|
||||
S {give(money,them,I)}
|
||||
|
||||
>>> printCCGDerivation(parses[1])
|
||||
I give them money
|
||||
NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
-------------------------------------------------->
|
||||
((S\NP)/NP) {\y z.give(y,them,z)}
|
||||
-------------------------------------------------------------->
|
||||
(S\NP) {\z.give(money,them,z)}
|
||||
---------------------------------------------------------------------->
|
||||
S {give(money,them,I)}
|
||||
|
||||
|
||||
>>> printCCGDerivation(parses[2])
|
||||
I give them money
|
||||
NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
-------------------------------------------------->
|
||||
((S\NP)/NP) {\y z.give(y,them,z)}
|
||||
---------------------------------------------------------->B
|
||||
(S/NP) {\y.give(y,them,I)}
|
||||
---------------------------------------------------------------------->
|
||||
S {give(money,them,I)}
|
||||
|
||||
|
||||
An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf
|
||||
|
||||
>>> lex = lexicon.fromstring('''
|
||||
... :- N, NP, S
|
||||
... money => N {money}
|
||||
... that => (N\\N)/(S/NP) {\\P Q x.(P(x) & Q(x))}
|
||||
... I => NP {I}
|
||||
... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)}
|
||||
... them => NP {them}
|
||||
... ''',
|
||||
... True)
|
||||
|
||||
>>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
|
||||
>>> parses = list(parser.parse("money that I give them".split()))
|
||||
>>> print(str(len(parses)) + " parses")
|
||||
3 parses
|
||||
|
||||
>>> printCCGDerivation(parses[0])
|
||||
money that I give them
|
||||
N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
-------------------------------------------------->
|
||||
((S\NP)/NP) {\y z.give(y,them,z)}
|
||||
---------------------------------------------------------->B
|
||||
(S/NP) {\y.give(y,them,I)}
|
||||
------------------------------------------------------------------------------------------------->
|
||||
(N\N) {\Q x.(give(x,them,I) & Q(x))}
|
||||
------------------------------------------------------------------------------------------------------------<
|
||||
N {\x.(give(x,them,I) & money(x))}
|
||||
|
||||
>>> printCCGDerivation(parses[1])
|
||||
money that I give them
|
||||
N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them}
|
||||
----------->T
|
||||
(N/(N\N)) {\F.F(money)}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
-------------------------------------------------->
|
||||
((S\NP)/NP) {\y z.give(y,them,z)}
|
||||
---------------------------------------------------------->B
|
||||
(S/NP) {\y.give(y,them,I)}
|
||||
------------------------------------------------------------------------------------------------->
|
||||
(N\N) {\Q x.(give(x,them,I) & Q(x))}
|
||||
------------------------------------------------------------------------------------------------------------>
|
||||
N {\x.(give(x,them,I) & money(x))}
|
||||
|
||||
>>> printCCGDerivation(parses[2])
|
||||
money that I give them
|
||||
N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them}
|
||||
----------->T
|
||||
(N/(N\N)) {\F.F(money)}
|
||||
-------------------------------------------------->B
|
||||
(N/(S/NP)) {\P x.(P(x) & money(x))}
|
||||
-------->T
|
||||
(S/(S\NP)) {\F.F(I)}
|
||||
-------------------------------------------------->
|
||||
((S\NP)/NP) {\y z.give(y,them,z)}
|
||||
---------------------------------------------------------->B
|
||||
(S/NP) {\y.give(y,them,I)}
|
||||
------------------------------------------------------------------------------------------------------------>
|
||||
N {\x.(give(x,them,I) & money(x))}
|
||||
|
||||
|
||||
-------
|
||||
Lexicon
|
||||
-------
|
||||
|
||||
>>> from nltk.ccg import lexicon
|
||||
|
||||
Parse lexicon with semantics
|
||||
|
||||
>>> print(str(lexicon.fromstring(
|
||||
... '''
|
||||
... :- S,NP
|
||||
...
|
||||
... IntransVsg :: S\\NP[sg]
|
||||
...
|
||||
... sleeps => IntransVsg {\\x.sleep(x)}
|
||||
... eats => S\\NP[sg]/NP {\\x y.eat(x,y)}
|
||||
...
|
||||
... and => var\\var/var {\\x y.x & y}
|
||||
... ''',
|
||||
... True
|
||||
... )))
|
||||
and => ((_var0\_var0)/_var0) {(\x y.x & y)}
|
||||
eats => ((S\NP['sg'])/NP) {\x y.eat(x,y)}
|
||||
sleeps => (S\NP['sg']) {\x.sleep(x)}
|
||||
|
||||
Parse lexicon without semantics
|
||||
|
||||
>>> print(str(lexicon.fromstring(
|
||||
... '''
|
||||
... :- S,NP
|
||||
...
|
||||
... IntransVsg :: S\\NP[sg]
|
||||
...
|
||||
... sleeps => IntransVsg
|
||||
... eats => S\\NP[sg]/NP {sem=\\x y.eat(x,y)}
|
||||
...
|
||||
... and => var\\var/var
|
||||
... ''',
|
||||
... False
|
||||
... )))
|
||||
and => ((_var0\_var0)/_var0)
|
||||
eats => ((S\NP['sg'])/NP)
|
||||
sleeps => (S\NP['sg'])
|
||||
|
||||
Semantics are missing
|
||||
|
||||
>>> print(str(lexicon.fromstring(
|
||||
... '''
|
||||
... :- S,NP
|
||||
...
|
||||
... eats => S\\NP[sg]/NP
|
||||
... ''',
|
||||
... True
|
||||
... )))
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AssertionError: eats => S\NP[sg]/NP must contain semantics because include_semantics is set to True
|
||||
|
||||
|
||||
------------------------------------
|
||||
CCG combinator semantics computation
|
||||
------------------------------------
|
||||
|
||||
>>> from nltk.sem.logic import *
|
||||
>>> from nltk.ccg.logic import *
|
||||
|
||||
>>> read_expr = Expression.fromstring
|
||||
|
||||
Compute semantics from function application
|
||||
|
||||
>>> print(str(compute_function_semantics(read_expr(r'\x.P(x)'), read_expr(r'book'))))
|
||||
P(book)
|
||||
|
||||
>>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'read'))))
|
||||
read(book)
|
||||
|
||||
>>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'\x.read(x)'))))
|
||||
read(book)
|
||||
|
||||
Compute semantics from composition
|
||||
|
||||
>>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'\x.Q(x)'))))
|
||||
\x.P(Q(x))
|
||||
|
||||
>>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'read'))))
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AssertionError: `read` must be a lambda expression
|
||||
|
||||
Compute semantics from substitution
|
||||
|
||||
>>> print(str(compute_substitution_semantics(read_expr(r'\x y.P(x,y)'), read_expr(r'\x.Q(x)'))))
|
||||
\x.P(x,Q(x))
|
||||
|
||||
>>> print(str(compute_substitution_semantics(read_expr(r'\x.P(x)'), read_expr(r'read'))))
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AssertionError: `\x.P(x)` must be a lambda expression with 2 arguments
|
||||
|
||||
Compute type-raise semantics
|
||||
|
||||
>>> print(str(compute_type_raised_semantics(read_expr(r'\x.P(x)'))))
|
||||
\F x.F(P(x))
|
||||
|
||||
>>> print(str(compute_type_raised_semantics(read_expr(r'\x.F(x)'))))
|
||||
\F1 x.F1(F(x))
|
||||
|
||||
>>> print(str(compute_type_raised_semantics(read_expr(r'\x y z.P(x,y,z)'))))
|
||||
\F x y z.F(P(x,y,z))
|
||||
@@ -0,0 +1,232 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=======
|
||||
Chat-80
|
||||
=======
|
||||
|
||||
Chat-80 was a natural language system which allowed the user to
|
||||
interrogate a Prolog knowledge base in the domain of world
|
||||
geography. It was developed in the early '80s by Warren and Pereira; see
|
||||
`<https://aclanthology.org/J82-3002.pdf>`_ for a description and
|
||||
`<http://www.cis.upenn.edu/~pereira/oldies.html>`_ for the source
|
||||
files.
|
||||
|
||||
The ``chat80`` module contains functions to extract data from the Chat-80
|
||||
relation files ('the world database'), and convert them into a format
|
||||
that can be incorporated in the FOL models of
|
||||
``nltk.sem.evaluate``. The code assumes that the Prolog
|
||||
input files are available in the NLTK corpora directory.
|
||||
|
||||
The Chat-80 World Database consists of the following files::
|
||||
|
||||
world0.pl
|
||||
rivers.pl
|
||||
cities.pl
|
||||
countries.pl
|
||||
contain.pl
|
||||
borders.pl
|
||||
|
||||
This module uses a slightly modified version of ``world0.pl``, in which
|
||||
a set of Prolog rules have been omitted. The modified file is named
|
||||
``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since
|
||||
it uses a list rather than a string in the second field.
|
||||
|
||||
Reading Chat-80 Files
|
||||
=====================
|
||||
|
||||
Chat-80 relations are like tables in a relational database. The
|
||||
relation acts as the name of the table; the first argument acts as the
|
||||
'primary key'; and subsequent arguments are further fields in the
|
||||
table. In general, the name of the table provides a label for a unary
|
||||
predicate whose extension is all the primary keys. For example,
|
||||
relations in ``cities.pl`` are of the following form::
|
||||
|
||||
'city(athens,greece,1368).'
|
||||
|
||||
Here, ``'athens'`` is the key, and will be mapped to a member of the
|
||||
unary predicate *city*.
|
||||
|
||||
By analogy with NLTK corpora, ``chat80`` defines a number of 'items'
|
||||
which correspond to the relations.
|
||||
|
||||
>>> from nltk.sem import chat80
|
||||
>>> print(chat80.items)
|
||||
('borders', 'circle_of_lat', 'circle_of_long', 'city', ...)
|
||||
|
||||
The fields in the table are mapped to binary predicates. The first
|
||||
argument of the predicate is the primary key, while the second
|
||||
argument is the data in the relevant field. Thus, in the above
|
||||
example, the third field is mapped to the binary predicate
|
||||
*population_of*, whose extension is a set of pairs such as
|
||||
``'(athens, 1368)'``.
|
||||
|
||||
An exception to this general framework is required by the relations in
|
||||
the files ``borders.pl`` and ``contain.pl``. These contain facts of the
|
||||
following form::
|
||||
|
||||
'borders(albania,greece).'
|
||||
|
||||
'contains0(africa,central_africa).'
|
||||
|
||||
We do not want to form a unary concept out of the element in
|
||||
the first field of these records, and we want the label of the binary
|
||||
relation just to be ``'border'``/``'contain'`` respectively.
|
||||
|
||||
In order to drive the extraction process, we use 'relation metadata bundles'
|
||||
which are Python dictionaries such as the following::
|
||||
|
||||
city = {'label': 'city',
|
||||
'closures': [],
|
||||
'schema': ['city', 'country', 'population'],
|
||||
'filename': 'cities.pl'}
|
||||
|
||||
According to this, the file ``city['filename']`` contains a list of
|
||||
relational tuples (or more accurately, the corresponding strings in
|
||||
Prolog form) whose predicate symbol is ``city['label']`` and whose
|
||||
relational schema is ``city['schema']``. The notion of a ``closure`` is
|
||||
discussed in the next section.
|
||||
|
||||
Concepts
|
||||
========
|
||||
In order to encapsulate the results of the extraction, a class of
|
||||
``Concept``\ s is introduced. A ``Concept`` object has a number of
|
||||
attributes, in particular a ``prefLabel``, an arity and ``extension``.
|
||||
|
||||
>>> c1 = chat80.Concept('dog', arity=1, extension=set(['d1', 'd2']))
|
||||
>>> print(c1)
|
||||
Label = 'dog'
|
||||
Arity = 1
|
||||
Extension = ['d1', 'd2']
|
||||
|
||||
|
||||
|
||||
The ``extension`` attribute makes it easier to inspect the output of
|
||||
the extraction.
|
||||
|
||||
>>> schema = ['city', 'country', 'population']
|
||||
>>> concepts = chat80.clause2concepts('cities.pl', 'city', schema)
|
||||
>>> concepts
|
||||
[Concept('city'), Concept('country_of'), Concept('population_of')]
|
||||
>>> for c in concepts:
|
||||
... print("%s:\n\t%s" % (c.prefLabel, c.extension[:4]))
|
||||
city:
|
||||
['athens', 'bangkok', 'barcelona', 'berlin']
|
||||
country_of:
|
||||
[('athens', 'greece'), ('bangkok', 'thailand'), ('barcelona', 'spain'), ('berlin', 'east_germany')]
|
||||
population_of:
|
||||
[('athens', '1368'), ('bangkok', '1178'), ('barcelona', '1280'), ('berlin', '3481')]
|
||||
|
||||
In addition, the ``extension`` can be further
|
||||
processed: in the case of the ``'border'`` relation, we check that the
|
||||
relation is **symmetric**, and in the case of the ``'contain'``
|
||||
relation, we carry out the **transitive closure**. The closure
|
||||
properties associated with a concept are indicated in the relation
|
||||
metadata, as indicated earlier.
|
||||
|
||||
>>> borders = set([('a1', 'a2'), ('a2', 'a3')])
|
||||
>>> c2 = chat80.Concept('borders', arity=2, extension=borders)
|
||||
>>> print(c2)
|
||||
Label = 'borders'
|
||||
Arity = 2
|
||||
Extension = [('a1', 'a2'), ('a2', 'a3')]
|
||||
>>> c3 = chat80.Concept('borders', arity=2, closures=['symmetric'], extension=borders)
|
||||
>>> c3.close()
|
||||
>>> print(c3)
|
||||
Label = 'borders'
|
||||
Arity = 2
|
||||
Extension = [('a1', 'a2'), ('a2', 'a1'), ('a2', 'a3'), ('a3', 'a2')]
|
||||
|
||||
The ``extension`` of a ``Concept`` object is then incorporated into a
|
||||
``Valuation`` object.
|
||||
|
||||
Persistence
|
||||
===========
|
||||
The functions ``val_dump`` and ``val_load`` are provided to allow a
|
||||
valuation to be stored in a persistent database and re-loaded, rather
|
||||
than having to be re-computed each time.
|
||||
|
||||
Individuals and Lexical Items
|
||||
=============================
|
||||
As well as deriving relations from the Chat-80 data, we also create a
|
||||
set of individual constants, one for each entity in the domain. The
|
||||
individual constants are string-identical to the entities. For
|
||||
example, given a data item such as ``'zloty'``, we add to the valuation
|
||||
a pair ``('zloty', 'zloty')``. In order to parse English sentences that
|
||||
refer to these entities, we also create a lexical item such as the
|
||||
following for each individual constant::
|
||||
|
||||
PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty'
|
||||
|
||||
The set of rules is written to the file ``chat_pnames.fcfg`` in the
|
||||
current directory.
|
||||
|
||||
SQL Query
|
||||
=========
|
||||
|
||||
The ``city`` relation is also available in RDB form and can be queried
|
||||
using SQL statements.
|
||||
|
||||
>>> import nltk
|
||||
>>> q = "SELECT City, Population FROM city_table WHERE Country = 'china' and Population > 1000"
|
||||
>>> for answer in chat80.sql_query('corpora/city_database/city.db', q):
|
||||
... print("%-10s %4s" % answer)
|
||||
canton 1496
|
||||
chungking 1100
|
||||
mukden 1551
|
||||
peking 2031
|
||||
shanghai 5407
|
||||
tientsin 1795
|
||||
|
||||
The (deliberately naive) grammar ``sql0.fcfg`` translates from English
|
||||
to SQL:
|
||||
|
||||
>>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg')
|
||||
% start S
|
||||
S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp]
|
||||
VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp]
|
||||
VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap]
|
||||
NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n]
|
||||
PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np]
|
||||
AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp]
|
||||
NP[SEM='Country="greece"'] -> 'Greece'
|
||||
NP[SEM='Country="china"'] -> 'China'
|
||||
Det[SEM='SELECT'] -> 'Which' | 'What'
|
||||
N[SEM='City FROM city_table'] -> 'cities'
|
||||
IV[SEM=''] -> 'are'
|
||||
A[SEM=''] -> 'located'
|
||||
P[SEM=''] -> 'in'
|
||||
|
||||
Given this grammar, we can express, and then execute, queries in English.
|
||||
|
||||
>>> cp = nltk.parse.load_parser('grammars/book_grammars/sql0.fcfg')
|
||||
>>> query = 'What cities are in China'
|
||||
>>> for tree in cp.parse(query.split()):
|
||||
... answer = tree.label()['SEM']
|
||||
... q = " ".join(answer)
|
||||
... print(q)
|
||||
...
|
||||
SELECT City FROM city_table WHERE Country="china"
|
||||
|
||||
>>> rows = chat80.sql_query('corpora/city_database/city.db', q)
|
||||
>>> for r in rows: print("%s" % r, end=' ')
|
||||
canton chungking dairen harbin kowloon mukden peking shanghai sian tientsin
|
||||
|
||||
|
||||
Using Valuations
|
||||
-----------------
|
||||
|
||||
In order to convert such an extension into a valuation, we use the
|
||||
``make_valuation()`` function; setting ``read=True`` creates and returns
|
||||
a new ``Valuation`` object which contains the results.
|
||||
|
||||
>>> val = chat80.make_valuation(concepts, read=True)
|
||||
>>> 'calcutta' in val['city']
|
||||
True
|
||||
>>> [town for (town, country) in val['country_of'] if country == 'india']
|
||||
['bombay', 'calcutta', 'delhi', 'hyderabad', 'madras']
|
||||
>>> dom = val.domain
|
||||
>>> g = nltk.sem.Assignment(dom)
|
||||
>>> m = nltk.sem.Model(dom, val)
|
||||
>>> m.evaluate(r'population_of(jakarta, 533)', g)
|
||||
True
|
||||
@@ -0,0 +1,190 @@
|
||||
=======================
|
||||
CHILDES Corpus Readers
|
||||
=======================
|
||||
|
||||
Read the XML version of the CHILDES corpus.
|
||||
|
||||
Setup
|
||||
=====
|
||||
|
||||
>>> from nltk.test.childes_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
How to use CHILDESCorpusReader
|
||||
==============================
|
||||
|
||||
Import the CHILDESCorpusReader class and read the CHILDES corpus saved in
|
||||
the nltk_data directory.
|
||||
|
||||
>>> import nltk
|
||||
>>> from nltk.corpus.reader import CHILDESCorpusReader
|
||||
>>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
|
||||
|
||||
Reading files in the Valian corpus (Valian, 1991).
|
||||
|
||||
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
|
||||
>>> valian.fileids()
|
||||
['Valian/01a.xml', 'Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml',...
|
||||
|
||||
Count the number of files
|
||||
|
||||
>>> len(valian.fileids())
|
||||
43
|
||||
|
||||
Printing properties of the corpus files.
|
||||
|
||||
>>> corpus_data = valian.corpus(valian.fileids())
|
||||
>>> print(corpus_data[0]['Lang'])
|
||||
eng
|
||||
>>> for key in sorted(corpus_data[0].keys()):
|
||||
... print(key, ": ", corpus_data[0][key])
|
||||
Corpus : valian
|
||||
Date : 1986-03-04
|
||||
Id : 01a
|
||||
Lang : eng
|
||||
Version : 2.0.1
|
||||
{http://www.w3.org/2001/XMLSchema-instance}schemaLocation : http://www.talkbank.org/ns/talkbank http://talkbank.org/software/talkbank.xsd
|
||||
|
||||
Printing information of participants of the corpus. The most common codes for
|
||||
the participants are 'CHI' (target child), 'MOT' (mother), and 'INV' (investigator).
|
||||
|
||||
>>> corpus_participants = valian.participants(valian.fileids())
|
||||
>>> for this_corpus_participants in corpus_participants[:2]:
|
||||
... for key in sorted(this_corpus_participants.keys()):
|
||||
... dct = this_corpus_participants[key]
|
||||
... print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])
|
||||
CHI : [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')]
|
||||
INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')]
|
||||
MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')]
|
||||
CHI : [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')]
|
||||
INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')]
|
||||
MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')]
|
||||
|
||||
Printing words.
|
||||
|
||||
>>> valian.words('Valian/01a.xml')
|
||||
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
|
||||
|
||||
Printing sentences.
|
||||
|
||||
>>> valian.sents('Valian/01a.xml')
|
||||
[['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname',
|
||||
'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when',
|
||||
'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'],
|
||||
["that's", 'okay'], ...
|
||||
|
||||
You can specify the participants with the argument *speaker*.
|
||||
|
||||
>>> valian.words('Valian/01a.xml',speaker=['INV'])
|
||||
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
|
||||
>>> valian.words('Valian/01a.xml',speaker=['MOT'])
|
||||
["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ...
|
||||
>>> valian.words('Valian/01a.xml',speaker=['CHI'])
|
||||
['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',...
|
||||
|
||||
|
||||
tagged_words() and tagged_sents() return the usual (word,pos) tuple lists.
|
||||
POS tags in the CHILDES are automatically assigned by MOR and POST programs
|
||||
(MacWhinney, 2000).
|
||||
|
||||
>>> valian.tagged_words('Valian/01a.xml')[:30]
|
||||
[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'),
|
||||
('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'),
|
||||
('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'),
|
||||
('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'),
|
||||
('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'),
|
||||
('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'),
|
||||
('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'),
|
||||
('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')]
|
||||
|
||||
>>> valian.tagged_sents('Valian/01a.xml')[:10]
|
||||
[[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'),
|
||||
('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'),
|
||||
('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'),
|
||||
('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'),
|
||||
('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')],
|
||||
[("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')],
|
||||
[("that's", 'pro:dem'), ('okay', 'adj')],
|
||||
[('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'),
|
||||
('eighty', 'det:num'), ('four', 'det:num')],
|
||||
[('great', 'adj')],
|
||||
[('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')],
|
||||
[('correct', 'adj')],
|
||||
[('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'),
|
||||
('a', 'det'), ('month', 'n'), ('ago', 'adv')]]
|
||||
|
||||
When the argument *stem* is true, the word stems (e.g., 'is' -> 'be-3S') are
|
||||
used instead of the original words.
|
||||
|
||||
>>> valian.words('Valian/01a.xml')[:30]
|
||||
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ...
|
||||
>>> valian.words('Valian/01a.xml',stem=True)[:30]
|
||||
['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ...
|
||||
|
||||
When the argument *replace* is true, the replaced words are used instead of
|
||||
the original words.
|
||||
|
||||
>>> valian.words('Valian/01a.xml',speaker='CHI')[247]
|
||||
'tikteat'
|
||||
>>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247]
|
||||
'trick'
|
||||
|
||||
When the argument *relation* is true, the relational relationships in the
|
||||
sentence are returned. See Sagae et al. (2010) for details of the relational
|
||||
structure adopted in the CHILDES.
|
||||
|
||||
>>> valian.words('Valian/01a.xml',relation=True)[:10]
|
||||
[[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]]
|
||||
|
||||
Printing age. When the argument *month* is true, the age information in
|
||||
the CHILDES format is converted into the number of months.
|
||||
|
||||
>>> valian.age()
|
||||
['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ...
|
||||
>>> valian.age('Valian/01a.xml')
|
||||
['P2Y1M3D']
|
||||
>>> valian.age('Valian/01a.xml',month=True)
|
||||
[25]
|
||||
|
||||
Printing MLU. The criteria for the MLU computation are broadly based on
|
||||
Brown (1973).
|
||||
|
||||
>>> valian.MLU()
|
||||
[2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490...,
|
||||
2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080...,
|
||||
4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284...,
|
||||
4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936...,
|
||||
5.25, 5.154696132596..., ...]
|
||||
|
||||
>>> valian.MLU('Valian/01a.xml')
|
||||
[2.35746606334...]
|
||||
|
||||
|
||||
Basic stuff
|
||||
==============================
|
||||
|
||||
Count the number of words and sentences of each file.
|
||||
|
||||
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
|
||||
>>> for this_file in valian.fileids()[:6]:
|
||||
... print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id'])
|
||||
... print("num of words: %i" % len(valian.words(this_file)))
|
||||
... print("num of sents: %i" % len(valian.sents(this_file)))
|
||||
valian 01a
|
||||
num of words: 3606
|
||||
num of sents: 1027
|
||||
valian 01b
|
||||
num of words: 4376
|
||||
num of sents: 1274
|
||||
valian 02a
|
||||
num of words: 2673
|
||||
num of sents: 801
|
||||
valian 02b
|
||||
num of words: 5020
|
||||
num of sents: 1583
|
||||
valian 03a
|
||||
num of words: 2743
|
||||
num of sents: 988
|
||||
valian 03b
|
||||
num of words: 4409
|
||||
num of sents: 1397
|
||||
@@ -0,0 +1,13 @@
|
||||
def setup_module():
    """Skip the calling test module when the CHILDES corpus is unavailable.

    Looks up ``corpora/childes/data-xml/Eng-USA-MOR/`` through
    ``nltk.data.find`` and calls ``pytest.skip`` with an explanatory message
    if that lookup raises ``LookupError``. Returns ``None`` when the corpus
    is found.
    """
    # Imports are kept inside the function, so importing this module does not
    # by itself import pytest or nltk.data.
    import pytest

    import nltk.data

    try:
        nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/")
    except LookupError:
        # The exception object is not needed; only the failed lookup matters.
        pytest.skip(
            "The CHILDES corpus is not found. "
            "It should be manually downloaded and saved/unpacked "
            "to [NLTK_Data_Dir]/corpora/childes/"
        )
|
||||
@@ -0,0 +1,372 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==========
|
||||
Chunking
|
||||
==========
|
||||
|
||||
>>> from nltk.chunk import *
|
||||
>>> from nltk.chunk.util import *
|
||||
>>> from nltk.chunk.regexp import *
|
||||
>>> from nltk import Tree
|
||||
|
||||
>>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
|
||||
>>> gold_chunked_text = tagstr2tree(tagged_text)
|
||||
>>> unchunked_text = gold_chunked_text.flatten()
|
||||
|
||||
Chunking uses a special regexp syntax for rules that delimit the chunks. These
|
||||
rules must be converted to 'regular' regular expressions before a sentence can
|
||||
be chunked.
|
||||
|
||||
>>> tag_pattern = "<DT>?<JJ>*<NN.*>"
|
||||
>>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
|
||||
>>> regexp_pattern
|
||||
'(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'
|
||||
|
||||
Construct some new chunking rules.
|
||||
|
||||
>>> chunk_rule = ChunkRule(r"<.*>+", "Chunk everything")
|
||||
>>> strip_rule = StripRule(r"<VBD|IN|\.>", "Strip on verbs/prepositions")
|
||||
>>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
|
||||
... "Split successive determiner/noun pairs")
|
||||
|
||||
|
||||
Create and score a series of chunk parsers, successively more complex.
|
||||
|
||||
>>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
|
||||
>>> chunked_text = chunk_parser.parse(unchunked_text)
|
||||
>>> print(chunked_text)
|
||||
(S
|
||||
(NP
|
||||
The/DT
|
||||
cat/NN
|
||||
sat/VBD
|
||||
on/IN
|
||||
the/DT
|
||||
mat/NN
|
||||
the/DT
|
||||
dog/NN
|
||||
chewed/VBD
|
||||
./.))
|
||||
|
||||
>>> chunkscore = ChunkScore()
|
||||
>>> chunkscore.score(gold_chunked_text, chunked_text)
|
||||
>>> print(chunkscore.precision())
|
||||
0.0
|
||||
|
||||
>>> print(chunkscore.recall())
|
||||
0.0
|
||||
|
||||
>>> print(chunkscore.f_measure())
|
||||
0
|
||||
|
||||
>>> for chunk in sorted(chunkscore.missed()): print(chunk)
|
||||
(NP The/DT cat/NN)
|
||||
(NP the/DT dog/NN)
|
||||
(NP the/DT mat/NN)
|
||||
|
||||
>>> for chunk in chunkscore.incorrect(): print(chunk)
|
||||
(NP
|
||||
The/DT
|
||||
cat/NN
|
||||
sat/VBD
|
||||
on/IN
|
||||
the/DT
|
||||
mat/NN
|
||||
the/DT
|
||||
dog/NN
|
||||
chewed/VBD
|
||||
./.)
|
||||
|
||||
>>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule],
|
||||
... chunk_label='NP')
|
||||
>>> chunked_text = chunk_parser.parse(unchunked_text)
|
||||
>>> print(chunked_text)
|
||||
(S
|
||||
(NP The/DT cat/NN)
|
||||
sat/VBD
|
||||
on/IN
|
||||
(NP the/DT mat/NN the/DT dog/NN)
|
||||
chewed/VBD
|
||||
./.)
|
||||
>>> assert chunked_text == chunk_parser.parse(list(unchunked_text))
|
||||
|
||||
>>> chunkscore = ChunkScore()
|
||||
>>> chunkscore.score(gold_chunked_text, chunked_text)
|
||||
>>> chunkscore.precision()
|
||||
0.5
|
||||
|
||||
>>> print(chunkscore.recall())
|
||||
0.33333333...
|
||||
|
||||
>>> print(chunkscore.f_measure())
|
||||
0.4
|
||||
|
||||
>>> for chunk in sorted(chunkscore.missed()): print(chunk)
|
||||
(NP the/DT dog/NN)
|
||||
(NP the/DT mat/NN)
|
||||
|
||||
>>> for chunk in chunkscore.incorrect(): print(chunk)
|
||||
(NP the/DT mat/NN the/DT dog/NN)
|
||||
|
||||
>>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule, split_rule],
|
||||
... chunk_label='NP')
|
||||
>>> chunked_text = chunk_parser.parse(unchunked_text, trace=True)
|
||||
# Input:
|
||||
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
|
||||
# Chunk everything:
|
||||
{<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>}
|
||||
# Strip on verbs/prepositions:
|
||||
{<DT> <NN>} <VBD> <IN> {<DT> <NN> <DT> <NN>} <VBD> <.>
|
||||
# Split successive determiner/noun pairs:
|
||||
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
|
||||
>>> print(chunked_text)
|
||||
(S
|
||||
(NP The/DT cat/NN)
|
||||
sat/VBD
|
||||
on/IN
|
||||
(NP the/DT mat/NN)
|
||||
(NP the/DT dog/NN)
|
||||
chewed/VBD
|
||||
./.)
|
||||
|
||||
>>> chunkscore = ChunkScore()
|
||||
>>> chunkscore.score(gold_chunked_text, chunked_text)
|
||||
>>> chunkscore.precision()
|
||||
1.0
|
||||
|
||||
>>> chunkscore.recall()
|
||||
1.0
|
||||
|
||||
>>> chunkscore.f_measure()
|
||||
1.0
|
||||
|
||||
>>> chunkscore.missed()
|
||||
[]
|
||||
|
||||
>>> chunkscore.incorrect()
|
||||
[]
|
||||
|
||||
>>> chunk_parser.rules()
|
||||
[<ChunkRule: '<.*>+'>, <StripRule: '<VBD|IN|\\.>'>,
|
||||
<SplitRule: '<DT><NN>', '<DT><NN>'>]
|
||||
|
||||
Printing parsers:
|
||||
|
||||
>>> print(repr(chunk_parser))
|
||||
<RegexpChunkParser with 3 rules>
|
||||
>>> print(chunk_parser)
|
||||
RegexpChunkParser with 3 rules:
|
||||
Chunk everything
|
||||
<ChunkRule: '<.*>+'>
|
||||
Strip on verbs/prepositions
|
||||
<StripRule: '<VBD|IN|\\.>'>
|
||||
Split successive determiner/noun pairs
|
||||
<SplitRule: '<DT><NN>', '<DT><NN>'>
|
||||
|
||||
Regression Tests
|
||||
~~~~~~~~~~~~~~~~
|
||||
ChunkParserI
|
||||
------------
|
||||
`ChunkParserI` is an abstract interface -- it is not meant to be
|
||||
instantiated directly.
|
||||
|
||||
>>> ChunkParserI().parse([])
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
|
||||
|
||||
ChunkString
|
||||
-----------
|
||||
ChunkString can be built from a tree of tagged tuples, a tree of
|
||||
trees, or a mixed list of both:
|
||||
|
||||
>>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
|
||||
>>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
|
||||
>>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
|
||||
>>> ChunkString(t1)
|
||||
<ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
|
||||
>>> ChunkString(t2)
|
||||
<ChunkString: '<t0><t1>'>
|
||||
>>> ChunkString(t3)
|
||||
<ChunkString: '<t0><t1>'>
|
||||
|
||||
Other values generate an error:
|
||||
|
||||
>>> ChunkString(Tree('S', ['x']))
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: chunk structures must contain tagged tokens or trees
|
||||
|
||||
The `str()` for a chunk string adds spaces to it, which makes it line
|
||||
up with `str()` output for other chunk strings over the same
|
||||
underlying input.
|
||||
|
||||
>>> cs = ChunkString(t1)
|
||||
>>> print(cs)
|
||||
<t0> <t1> <t2> <t3> <t4> <t5> <t6> <t7> <t8> <t9>
|
||||
>>> cs.xform('<t3>', '{<t3>}')
|
||||
>>> print(cs)
|
||||
<t0> <t1> <t2> {<t3>} <t4> <t5> <t6> <t7> <t8> <t9>
|
||||
|
||||
The `_verify()` method makes sure that our transforms don't corrupt
|
||||
the chunk string. When debug_level is 2 or higher, `_verify()` will be
|
||||
called at the end of every call to `xform`.
|
||||
|
||||
>>> cs = ChunkString(t1, debug_level=3)
|
||||
|
||||
>>> # tag not marked with <...>:
|
||||
>>> cs.xform('<t3>', 't3')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Transformation generated invalid chunkstring:
|
||||
<t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>
|
||||
|
||||
>>> # brackets not balanced:
|
||||
>>> cs.xform('<t3>', '{<t3>')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Transformation generated invalid chunkstring:
|
||||
<t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>
|
||||
|
||||
>>> # nested brackets:
|
||||
>>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Transformation generated invalid chunkstring:
|
||||
<t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>
|
||||
|
||||
>>> # modified tags:
|
||||
>>> cs.xform('<t3>', '<t9>')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Transformation generated invalid chunkstring: tag changed
|
||||
|
||||
>>> # added tags:
|
||||
>>> cs.xform('<t9>', '<t9><t10>')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Transformation generated invalid chunkstring: tag changed
|
||||
|
||||
Chunking Rules
|
||||
--------------
|
||||
|
||||
Test the different rule constructors & __repr__ methods:
|
||||
|
||||
>>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_STRIP_PATTERN,
|
||||
... '{<a|b>}', 'chunk <a> and <b>')
|
||||
>>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_STRIP_PATTERN),
|
||||
... '{<a|b>}', 'chunk <a> and <b>')
|
||||
>>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>')
|
||||
>>> r4 = StripRule('<a|b>', 'strip <a> and <b>')
|
||||
>>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>')
|
||||
>>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>')
|
||||
>>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>')
|
||||
>>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>')
|
||||
>>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>')
|
||||
>>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9:
|
||||
... print(rule)
|
||||
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
|
||||
<RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
|
||||
<ChunkRule: '<a|b>'>
|
||||
<StripRule: '<a|b>'>
|
||||
<UnChunkRule: '<a|b>'>
|
||||
<MergeRule: '<a>', '<b>'>
|
||||
<SplitRule: '<a>', '<b>'>
|
||||
<ExpandLeftRule: '<a>', '<b>'>
|
||||
<ExpandRightRule: '<a>', '<b>'>
|
||||
|
||||
`tag_pattern2re_pattern()` complains if the tag pattern looks problematic:
|
||||
|
||||
>>> tag_pattern2re_pattern('{}')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Bad tag pattern: '{}'
|
||||
|
||||
RegexpChunkParser
|
||||
-----------------
|
||||
|
||||
A warning is printed when parsing an empty sentence:
|
||||
|
||||
>>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
|
||||
>>> parser.parse(Tree('S', []))
|
||||
Warning: parsing empty text
|
||||
Tree('S', [])
|
||||
|
||||
RegexpParser
|
||||
------------
|
||||
|
||||
>>> parser = RegexpParser('''
|
||||
... NP: {<DT>? <JJ>* <NN>*} # NP
|
||||
... P: {<IN>} # Preposition
|
||||
... V: {<V.*>} # Verb
|
||||
... PP: {<P> <NP>} # PP -> P NP
|
||||
... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)*
|
||||
... ''')
|
||||
>>> print(repr(parser))
|
||||
<chunk.RegexpParser with 5 stages>
|
||||
>>> print(parser)
|
||||
chunk.RegexpParser with 5 stages:
|
||||
RegexpChunkParser with 1 rules:
|
||||
NP <ChunkRule: '<DT>? <JJ>* <NN>*'>
|
||||
RegexpChunkParser with 1 rules:
|
||||
Preposition <ChunkRule: '<IN>'>
|
||||
RegexpChunkParser with 1 rules:
|
||||
Verb <ChunkRule: '<V.*>'>
|
||||
RegexpChunkParser with 1 rules:
|
||||
PP -> P NP <ChunkRule: '<P> <NP>'>
|
||||
RegexpChunkParser with 1 rules:
|
||||
VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'>
|
||||
>>> print(parser.parse(unchunked_text, trace=True))
|
||||
# Input:
|
||||
<DT> <NN> <VBD> <IN> <DT> <NN> <DT> <NN> <VBD> <.>
|
||||
# NP:
|
||||
{<DT> <NN>} <VBD> <IN> {<DT> <NN>}{<DT> <NN>} <VBD> <.>
|
||||
# Input:
|
||||
<NP> <VBD> <IN> <NP> <NP> <VBD> <.>
|
||||
# Preposition:
|
||||
<NP> <VBD> {<IN>} <NP> <NP> <VBD> <.>
|
||||
# Input:
|
||||
<NP> <VBD> <P> <NP> <NP> <VBD> <.>
|
||||
# Verb:
|
||||
<NP> {<VBD>} <P> <NP> <NP> {<VBD>} <.>
|
||||
# Input:
|
||||
<NP> <V> <P> <NP> <NP> <V> <.>
|
||||
# PP -> P NP:
|
||||
<NP> <V> {<P> <NP>} <NP> <V> <.>
|
||||
# Input:
|
||||
<NP> <V> <PP> <NP> <V> <.>
|
||||
# VP -> V (NP|PP)*:
|
||||
<NP> {<V> <PP> <NP>}{<V>} <.>
|
||||
(S
|
||||
(NP The/DT cat/NN)
|
||||
(VP
|
||||
(V sat/VBD)
|
||||
(PP (P on/IN) (NP the/DT mat/NN))
|
||||
(NP the/DT dog/NN))
|
||||
(VP (V chewed/VBD))
|
||||
./.)
|
||||
|
||||
Test parsing of other rule types:
|
||||
|
||||
>>> print(RegexpParser('''
|
||||
... X:
|
||||
... }<a><b>{ # strip rule
|
||||
... <a>}{<b> # split rule
|
||||
... <a>{}<b> # merge rule
|
||||
... <a>{<b>}<c> # chunk rule w/ context
|
||||
... '''))
|
||||
chunk.RegexpParser with 1 stages:
|
||||
RegexpChunkParser with 4 rules:
|
||||
strip rule <StripRule: '<a><b>'>
|
||||
split rule <SplitRule: '<a>', '<b>'>
|
||||
merge rule <MergeRule: '<a>', '<b>'>
|
||||
chunk rule w/ context <ChunkRuleWithContext: '<a>', '<b>', '<c>'>
|
||||
|
||||
Illegal patterns give an error message:
|
||||
|
||||
>>> print(RegexpParser('X: {<foo>} {<bar>}'))
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Illegal chunk pattern: {<foo>} {<bar>}
|
||||
@@ -0,0 +1,202 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=============
|
||||
Classifiers
|
||||
=============
|
||||
|
||||
>>> from nltk.test.classify_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
Classifiers label tokens with category labels (or *class labels*).
|
||||
Typically, labels are represented with strings (such as ``"health"``
|
||||
or ``"sports"``). In NLTK, classifiers are defined using classes that
|
||||
implement the `ClassifierI` interface, which supports the following operations:
|
||||
|
||||
- self.classify(featureset)
|
||||
- self.classify_many(featuresets)
|
||||
- self.labels()
|
||||
- self.prob_classify(featureset)
|
||||
- self.prob_classify_many(featuresets)
|
||||
|
||||
NLTK defines several classifier classes:
|
||||
|
||||
- `ConditionalExponentialClassifier`
|
||||
- `DecisionTreeClassifier`
|
||||
- `MaxentClassifier`
|
||||
- `NaiveBayesClassifier`
|
||||
- `WekaClassifier`
|
||||
|
||||
Classifiers are typically created by training them on a training
|
||||
corpus.
|
||||
|
||||
|
||||
Regression Tests
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
We define a very simple training corpus with 3 binary features: ['a',
|
||||
'b', 'c'], and two labels: ['x', 'y']. We use a simple feature set so
|
||||
that the correct answers can be calculated analytically (although we
|
||||
haven't done this yet for all tests).
|
||||
|
||||
>>> import nltk
|
||||
>>> train = [
|
||||
... (dict(a=1,b=1,c=1), 'y'),
|
||||
... (dict(a=1,b=1,c=1), 'x'),
|
||||
... (dict(a=1,b=1,c=0), 'y'),
|
||||
... (dict(a=0,b=1,c=1), 'x'),
|
||||
... (dict(a=0,b=1,c=1), 'y'),
|
||||
... (dict(a=0,b=0,c=1), 'y'),
|
||||
... (dict(a=0,b=1,c=0), 'x'),
|
||||
... (dict(a=0,b=0,c=0), 'x'),
|
||||
... (dict(a=0,b=1,c=1), 'y'),
|
||||
... (dict(a=None,b=1,c=0), 'x'),
|
||||
... ]
|
||||
>>> test = [
|
||||
... (dict(a=1,b=0,c=1)), # unseen
|
||||
... (dict(a=1,b=0,c=0)), # unseen
|
||||
... (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x
|
||||
... (dict(a=0,b=1,c=0)), # seen 1 time, label=x
|
||||
... ]
|
||||
|
||||
Test the Naive Bayes classifier:
|
||||
|
||||
>>> classifier = nltk.classify.NaiveBayesClassifier.train(train)
|
||||
>>> sorted(classifier.labels())
|
||||
['x', 'y']
|
||||
>>> classifier.classify_many(test)
|
||||
['y', 'x', 'y', 'x']
|
||||
>>> for pdist in classifier.prob_classify_many(test):
|
||||
... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
|
||||
0.2500 0.7500
|
||||
0.5833 0.4167
|
||||
0.3571 0.6429
|
||||
0.7000 0.3000
|
||||
>>> classifier.show_most_informative_features()
|
||||
Most Informative Features
|
||||
c = 0 x : y = 2.3 : 1.0
|
||||
c = 1 y : x = 1.8 : 1.0
|
||||
a = 1 y : x = 1.7 : 1.0
|
||||
a = 0 x : y = 1.0 : 1.0
|
||||
b = 0 x : y = 1.0 : 1.0
|
||||
b = 1 x : y = 1.0 : 1.0
|
||||
|
||||
Test the Decision Tree classifier (without None):
|
||||
|
||||
>>> classifier = nltk.classify.DecisionTreeClassifier.train(
|
||||
... train[:-1], entropy_cutoff=0,
|
||||
... support_cutoff=0)
|
||||
>>> sorted(classifier.labels())
|
||||
['x', 'y']
|
||||
>>> print(classifier)
|
||||
c=0? .................................................. x
|
||||
a=0? ................................................ x
|
||||
a=1? ................................................ y
|
||||
c=1? .................................................. y
|
||||
<BLANKLINE>
|
||||
>>> classifier.classify_many(test)
|
||||
['y', 'y', 'y', 'x']
|
||||
>>> for pdist in classifier.prob_classify_many(test):
|
||||
... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y')))
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
|
||||
|
||||
Test the Decision Tree classifier (with None):
|
||||
|
||||
>>> classifier = nltk.classify.DecisionTreeClassifier.train(
|
||||
... train, entropy_cutoff=0,
|
||||
... support_cutoff=0)
|
||||
>>> sorted(classifier.labels())
|
||||
['x', 'y']
|
||||
>>> print(classifier)
|
||||
c=0? .................................................. x
|
||||
a=0? ................................................ x
|
||||
a=1? ................................................ y
|
||||
a=None? ............................................. x
|
||||
c=1? .................................................. y
|
||||
<BLANKLINE>
|
||||
|
||||
|
||||
Test SklearnClassifier, which requires the scikit-learn package.
|
||||
|
||||
>>> from nltk.classify import SklearnClassifier
|
||||
>>> from sklearn.naive_bayes import BernoulliNB
|
||||
>>> from sklearn.svm import SVC
|
||||
>>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"),
|
||||
... ({"a": 5, "b": 2, "c": 1}, "ham"),
|
||||
... ({"a": 0, "b": 3, "c": 4}, "spam"),
|
||||
... ({"a": 5, "b": 1, "c": 1}, "ham"),
|
||||
... ({"a": 1, "b": 4, "c": 3}, "spam")]
|
||||
>>> classif = SklearnClassifier(BernoulliNB()).train(train_data)
|
||||
>>> test_data = [{"a": 3, "b": 2, "c": 1},
|
||||
... {"a": 0, "b": 3, "c": 7}]
|
||||
>>> classif.classify_many(test_data)
|
||||
['ham', 'spam']
|
||||
>>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data)
|
||||
>>> classif.classify_many(test_data)
|
||||
['ham', 'spam']
|
||||
|
||||
Test the Maximum Entropy classifier training algorithms; they should all
|
||||
generate the same results.
|
||||
|
||||
>>> def print_maxent_test_header():
|
||||
... print(' '*11+''.join([' test[%s] ' % i
|
||||
... for i in range(len(test))]))
|
||||
... print(' '*11+' p(x) p(y)'*len(test))
|
||||
... print('-'*(11+15*len(test)))
|
||||
|
||||
>>> def test_maxent(algorithm):
|
||||
... print('%11s' % algorithm, end=' ')
|
||||
... try:
|
||||
... classifier = nltk.classify.MaxentClassifier.train(
|
||||
... train, algorithm, trace=0, max_iter=1000)
|
||||
... except Exception as e:
|
||||
... print('Error: %r' % e)
|
||||
... return
|
||||
...
|
||||
... for featureset in test:
|
||||
... pdist = classifier.prob_classify(featureset)
|
||||
... print('%8.2f%6.2f' % (pdist.prob('x'), pdist.prob('y')), end=' ')
|
||||
... print()
|
||||
|
||||
>>> print_maxent_test_header(); test_maxent('GIS'); test_maxent('IIS')
|
||||
test[0] test[1] test[2] test[3]
|
||||
p(x) p(y) p(x) p(y) p(x) p(y) p(x) p(y)
|
||||
-----------------------------------------------------------------------
|
||||
GIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
|
||||
IIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
|
||||
|
||||
>>> test_maxent('MEGAM'); test_maxent('TADM') # doctest: +SKIP
|
||||
MEGAM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
|
||||
TADM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24
|
||||
|
||||
|
||||
|
||||
Regression tests for TypedMaxentFeatureEncoding
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
>>> from nltk.classify import maxent
|
||||
>>> train = [
|
||||
... ({'a': 1, 'b': 1, 'c': 1}, 'y'),
|
||||
... ({'a': 5, 'b': 5, 'c': 5}, 'x'),
|
||||
... ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'),
|
||||
... ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'),
|
||||
... ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'),
|
||||
... ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x')
|
||||
... ]
|
||||
|
||||
>>> test = [
|
||||
... {'a': 1, 'b': 0.8, 'c': 1.2},
|
||||
... {'a': 5.2, 'b': 5.1, 'c': 5}
|
||||
... ]
|
||||
|
||||
>>> encoding = maxent.TypedMaxentFeatureEncoding.train(
|
||||
... train, count_cutoff=3, alwayson_features=True)
|
||||
|
||||
>>> classifier = maxent.MaxentClassifier.train(
|
||||
... train, bernoulli=False, encoding=encoding, trace=0)
|
||||
|
||||
>>> classifier.classify_many(test)
|
||||
['y', 'x']
|
||||
@@ -0,0 +1,5 @@
|
||||
# most of classify.doctest requires numpy
|
||||
def setup_module():
    """Skip the calling test module when numpy cannot be imported."""
    from pytest import importorskip

    importorskip("numpy")
|
||||
@@ -0,0 +1,31 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===========
|
||||
Collections
|
||||
===========
|
||||
|
||||
>>> import nltk
|
||||
>>> from nltk.collections import *
|
||||
|
||||
Trie
|
||||
----
|
||||
|
||||
Trie can be pickled:
|
||||
|
||||
>>> import pickle
|
||||
>>> trie = nltk.collections.Trie(['a'])
|
||||
>>> s = pickle.dumps(trie)
|
||||
>>> pickle.loads(s)
|
||||
{'a': {True: None}}
|
||||
|
||||
LazyIteratorList
|
||||
----------------
|
||||
|
||||
Fetching the length of a LazyIteratorList object does not throw a StopIteration exception:
|
||||
|
||||
>>> lil = LazyIteratorList(i for i in range(1, 11))
|
||||
>>> lil[-1]
|
||||
10
|
||||
>>> len(lil)
|
||||
10
|
||||
@@ -0,0 +1,307 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==============
|
||||
Collocations
|
||||
==============
|
||||
|
||||
Overview
|
||||
~~~~~~~~
|
||||
|
||||
Collocations are expressions of multiple words which commonly co-occur. For
|
||||
example, the top ten bigram collocations in Genesis are listed below, as
|
||||
measured using Pointwise Mutual Information.
|
||||
|
||||
>>> import nltk
|
||||
>>> from nltk.collocations import *
|
||||
>>> bigram_measures = nltk.collocations.BigramAssocMeasures()
|
||||
>>> trigram_measures = nltk.collocations.TrigramAssocMeasures()
|
||||
>>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures()
|
||||
>>> finder = BigramCollocationFinder.from_words(
|
||||
... nltk.corpus.genesis.words('english-web.txt'))
|
||||
>>> finder.nbest(bigram_measures.pmi, 10)
|
||||
[('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'),
|
||||
('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'),
|
||||
('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'),
|
||||
('cutting', 'instrument')]
|
||||
|
||||
While these words are highly collocated, the expressions are also very
|
||||
infrequent. Therefore it is useful to apply filters, such as ignoring all
|
||||
bigrams which occur fewer than three times in the corpus:
|
||||
|
||||
>>> finder.apply_freq_filter(3)
|
||||
>>> finder.nbest(bigram_measures.pmi, 10)
|
||||
[('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'),
|
||||
('ewe', 'lambs'), ('Most', 'High'), ('many', 'colors'),
|
||||
('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'),
|
||||
('living', 'creature')]
|
||||
|
||||
We may similarly find collocations among tagged words:
|
||||
|
||||
>>> finder = BigramCollocationFinder.from_words(
|
||||
... nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
|
||||
>>> finder.nbest(bigram_measures.pmi, 5)
|
||||
[(('1,119', 'NUM'), ('votes', 'NOUN')),
|
||||
(('1962', 'NUM'), ("governor's", 'NOUN')),
|
||||
(('637', 'NUM'), ('E.', 'NOUN')),
|
||||
(('Alpharetta', 'NOUN'), ('prison', 'NOUN')),
|
||||
(('Bar', 'NOUN'), ('Association', 'NOUN'))]
|
||||
|
||||
Or tags alone:
|
||||
|
||||
>>> finder = BigramCollocationFinder.from_words(t for w, t in
|
||||
... nltk.corpus.brown.tagged_words('ca01', tagset='universal'))
|
||||
>>> finder.nbest(bigram_measures.pmi, 10)
|
||||
[('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'),
|
||||
('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')]
|
||||
|
||||
Or spanning intervening words:
|
||||
|
||||
>>> finder = BigramCollocationFinder.from_words(
|
||||
... nltk.corpus.genesis.words('english-web.txt'),
|
||||
... window_size = 20)
|
||||
>>> finder.apply_freq_filter(2)
|
||||
>>> ignored_words = nltk.corpus.stopwords.words('english')
|
||||
>>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
|
||||
>>> finder.nbest(bigram_measures.likelihood_ratio, 10)
|
||||
[('chief', 'chief'), ('became', 'father'), ('years', 'became'),
|
||||
('hundred', 'years'), ('lived', 'became'), ('king', 'king'),
|
||||
('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'),
|
||||
('hundred', 'became')]
|
||||
|
||||
Finders
|
||||
~~~~~~~
|
||||
|
||||
The collocations package provides collocation finders which by default
|
||||
consider all ngrams in a text as candidate collocations:
|
||||
|
||||
>>> text = "I do not like green eggs and ham, I do not like them Sam I am!"
|
||||
>>> tokens = nltk.wordpunct_tokenize(text)
|
||||
>>> finder = BigramCollocationFinder.from_words(tokens)
|
||||
>>> scored = finder.score_ngrams(bigram_measures.raw_freq)
|
||||
>>> sorted(bigram for bigram, score in scored)
|
||||
[(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'),
|
||||
('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'),
|
||||
('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'),
|
||||
('them', 'Sam')]
|
||||
|
||||
We could otherwise construct the collocation finder from manually-derived
|
||||
FreqDists:
|
||||
|
||||
>>> word_fd = nltk.FreqDist(tokens)
|
||||
>>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
|
||||
>>> finder = BigramCollocationFinder(word_fd, bigram_fd)
|
||||
>>> scored == finder.score_ngrams(bigram_measures.raw_freq)
|
||||
True
|
||||
|
||||
A similar interface is provided for trigrams:
|
||||
|
||||
>>> finder = TrigramCollocationFinder.from_words(tokens)
|
||||
>>> scored = finder.score_ngrams(trigram_measures.raw_freq)
|
||||
>>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens))
|
||||
True
|
||||
|
||||
We may want to select only the top n results:
|
||||
|
||||
>>> sorted(finder.nbest(trigram_measures.raw_freq, 2))
|
||||
[('I', 'do', 'not'), ('do', 'not', 'like')]
|
||||
|
||||
Alternatively, we can select those above a minimum score value:
|
||||
|
||||
>>> sorted(finder.above_score(trigram_measures.raw_freq,
|
||||
... 1.0 / len(tuple(nltk.trigrams(tokens)))))
|
||||
[('I', 'do', 'not'), ('do', 'not', 'like')]
|
||||
|
||||
Now spanning intervening words:
|
||||
|
||||
>>> finder = TrigramCollocationFinder.from_words(tokens)
|
||||
>>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4)
|
||||
>>> sorted(finder.nbest(trigram_measures.raw_freq, 4))
|
||||
[('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')]
|
||||
|
||||
A closer look at the finder's ngram frequencies:
|
||||
|
||||
>>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10]
|
||||
[(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2),
|
||||
(('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1),
|
||||
((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1),
|
||||
(('Sam', 'I', 'am'), 1)]
|
||||
|
||||
A similar interface is provided for fourgrams:
|
||||
|
||||
>>> finder_4grams = QuadgramCollocationFinder.from_words(tokens)
|
||||
>>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq)
|
||||
>>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4))
|
||||
True
|
||||
|
||||
Filtering candidates
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
All the ngrams in a text are often too many to be useful when finding
|
||||
collocations. It is generally useful to remove some words or punctuation,
|
||||
and to require a minimum frequency for candidate collocations.
|
||||
|
||||
Given our sample text above, if we remove all trigrams containing personal
|
||||
pronouns from candidature, score_ngrams should return 6 fewer results, and
|
||||
'do not like' will be the only candidate which occurs more than once:
|
||||
|
||||
>>> finder = TrigramCollocationFinder.from_words(tokens)
|
||||
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
||||
14
|
||||
>>> finder.apply_word_filter(lambda w: w in ('I', 'me'))
|
||||
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
||||
8
|
||||
>>> sorted(finder.above_score(trigram_measures.raw_freq,
|
||||
... 1.0 / len(tuple(nltk.trigrams(tokens)))))
|
||||
[('do', 'not', 'like')]
|
||||
|
||||
Sometimes a filter is a function on the whole ngram, rather than each word,
|
||||
such as if we may permit 'and' to appear in the middle of a trigram, but
|
||||
not on either edge:
|
||||
|
||||
>>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3))
|
||||
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
||||
6
|
||||
|
||||
Finally, it is often important to remove low frequency candidates, as we
|
||||
lack sufficient evidence about their significance as collocations:
|
||||
|
||||
>>> finder.apply_freq_filter(2)
|
||||
>>> len(finder.score_ngrams(trigram_measures.raw_freq))
|
||||
1
|
||||
|
||||
Association measures
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
A number of measures are available to score collocations or other associations.
|
||||
The arguments to measure functions are marginals of a contingency table, in the
|
||||
bigram case (n_ii, (n_ix, n_xi), n_xx)::
|
||||
|
||||
w1 ~w1
|
||||
------ ------
|
||||
w2 | n_ii | n_oi | = n_xi
|
||||
------ ------
|
||||
~w2 | n_io | n_oo |
|
||||
------ ------
|
||||
= n_ix TOTAL = n_xx
|
||||
|
||||
We test their calculation using some known values presented in Manning and
|
||||
Schutze's text and other papers.
|
||||
|
||||
Student's t: examples from Manning and Schutze 5.3.2
|
||||
|
||||
>>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668))
|
||||
0.9999
|
||||
>>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668))
|
||||
4.4721
|
||||
|
||||
Chi-square: examples from Manning and Schutze 5.3.3
|
||||
|
||||
>>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668))
|
||||
1.55
|
||||
>>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007))
|
||||
456400
|
||||
|
||||
Likelihood ratios: examples from Dunning, CL, 1993
|
||||
|
||||
>>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777))
|
||||
270.72
|
||||
>>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777))
|
||||
95.29
|
||||
|
||||
Pointwise Mutual Information: examples from Manning and Schutze 5.4
|
||||
|
||||
>>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668))
|
||||
18.38
|
||||
>>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668))
|
||||
0.29
|
||||
|
||||
TODO: Find authoritative results for trigrams.
|
||||
|
||||
Using contingency table values
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
While frequency counts make marginals readily available for collocation
|
||||
finding, it is common to find published contingency table values. The
|
||||
collocations package therefore provides a wrapper, ContingencyMeasures, which
|
||||
wraps an association measures class, providing association measures which
|
||||
take contingency values as arguments, (n_ii, n_io, n_oi, n_oo) in the
|
||||
bigram case.
|
||||
|
||||
>>> from nltk.metrics import ContingencyMeasures
|
||||
>>> cont_bigram_measures = ContingencyMeasures(bigram_measures)
|
||||
>>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740))
|
||||
95.29
|
||||
>>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173))
|
||||
1.55
|
||||
|
||||
Ranking and correlation
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
It is useful to consider the results of finding collocations as a ranking, and
|
||||
the rankings output using different association measures can be compared using
|
||||
the Spearman correlation coefficient.
|
||||
|
||||
Ranks can be assigned to a sorted list of results trivially by assigning
|
||||
strictly increasing ranks to each result:
|
||||
|
||||
>>> from nltk.metrics.spearman import *
|
||||
>>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5']
|
||||
>>> print(list(ranks_from_sequence(results_list)))
|
||||
[('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)]
|
||||
|
||||
If scores are available for each result, we may allow sufficiently similar
|
||||
results (differing by no more than rank_gap) to be assigned the same rank:
|
||||
|
||||
>>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0),
|
||||
... ('item4', 35.0), ('item5', 14.0)]
|
||||
>>> print(list(ranks_from_scores(results_scored, rank_gap=5)))
|
||||
[('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)]
|
||||
|
||||
The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing
|
||||
two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates
|
||||
exact opposite rankings.
|
||||
|
||||
>>> print('%0.1f' % spearman_correlation(
|
||||
... ranks_from_sequence(results_list),
|
||||
... ranks_from_sequence(results_list)))
|
||||
1.0
|
||||
>>> print('%0.1f' % spearman_correlation(
|
||||
... ranks_from_sequence(reversed(results_list)),
|
||||
... ranks_from_sequence(results_list)))
|
||||
-1.0
|
||||
>>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4']
|
||||
>>> print('%0.1f' % spearman_correlation(
|
||||
... ranks_from_sequence(results_list),
|
||||
... ranks_from_sequence(results_list2)))
|
||||
0.6
|
||||
>>> print('%0.1f' % spearman_correlation(
|
||||
... ranks_from_sequence(reversed(results_list)),
|
||||
... ranks_from_sequence(results_list2)))
|
||||
-0.6
|
||||
|
||||
Keywords
|
||||
~~~~~~~~
|
||||
|
||||
Bigram association metrics can also be used to perform keyword analysis. For example, this finds the keywords
|
||||
associated with the "romance" section of the Brown corpus as measured by likelihood ratio:
|
||||
|
||||
>>> romance = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words(categories='romance') if w.isalpha())
|
||||
>>> freq = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words() if w.isalpha())
|
||||
|
||||
>>> key = nltk.FreqDist()
|
||||
>>> for w in romance:
|
||||
... key[w] = bigram_measures.likelihood_ratio(romance[w], (freq[w], romance.N()), freq.N())
|
||||
|
||||
>>> for k,v in key.most_common(10):
|
||||
... print(f'{k:10s} {v:9.3f}')
|
||||
she 1163.325
|
||||
i 995.961
|
||||
her 930.528
|
||||
you 513.149
|
||||
of 501.891
|
||||
is 463.386
|
||||
had 421.615
|
||||
he 411.000
|
||||
the 347.632
|
||||
said 300.811
|
||||
@@ -0,0 +1,75 @@
|
||||
.. Copyright (C) 2001-2016 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==================================
|
||||
Concordance Example
|
||||
==================================
|
||||
|
||||
A concordance view shows us every occurrence of a given
|
||||
word, together with some context. Here we look up the word monstrous
|
||||
in Moby Dick by entering ``text`` followed by a period, then the term
|
||||
concordance, and then placing "monstrous" in parentheses:
|
||||
|
||||
>>> from nltk.corpus import gutenberg
|
||||
>>> from nltk.text import Text
|
||||
>>> corpus = gutenberg.words('melville-moby_dick.txt')
|
||||
>>> text = Text(corpus)
|
||||
|
||||
>>> text.concordance("monstrous")
|
||||
Displaying 11 of 11 matches:
|
||||
ong the former , one was of a most monstrous size . ... This came towards us ,
|
||||
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
|
||||
ll over with a heathenish array of monstrous clubs and spears . Some were thick
|
||||
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
|
||||
that has survived the flood ; most monstrous and most mountainous ! That Himmal
|
||||
they might scout at Moby Dick as a monstrous fable , or still worse and more de
|
||||
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
|
||||
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
|
||||
ere to enter upon those still more monstrous stories of them which are to be fo
|
||||
ght have been rummaged out of this monstrous cabinet there is no telling . But
|
||||
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u
|
||||
|
||||
>>> text.concordance("monstrous")
|
||||
Displaying 11 of 11 matches:
|
||||
ong the former , one was of a most monstrous size . ... This came towards us ,
|
||||
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
|
||||
ll over with a heathenish array of monstrous clubs and spears . Some were thick
|
||||
...
|
||||
|
||||
We can also search for a multi-word phrase by passing a list of strings:
|
||||
|
||||
>>> text.concordance(["monstrous", "size"])
|
||||
Displaying 2 of 2 matches:
|
||||
the former , one was of a most monstrous size . ... This came towards us , op
|
||||
Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead upo
|
||||
|
||||
=================================
|
||||
Concordance List
|
||||
=================================
|
||||
|
||||
Often we need to store the results of concordance for further usage.
|
||||
To do so, call the ``concordance_list`` method, which returns the
|
||||
matches as a list instead of printing them:
|
||||
|
||||
>>> from nltk.corpus import gutenberg
|
||||
>>> from nltk.text import Text
|
||||
>>> corpus = gutenberg.words('melville-moby_dick.txt')
|
||||
>>> text = Text(corpus)
|
||||
>>> con_list = text.concordance_list("monstrous")
|
||||
>>> con_list[2].line
|
||||
'll over with a heathenish array of monstrous clubs and spears . Some were thick'
|
||||
>>> len(con_list)
|
||||
11
|
||||
|
||||
=================================
|
||||
Patching Issue #2088
|
||||
=================================
|
||||
|
||||
Patching https://github.com/nltk/nltk/issues/2088
|
||||
The left slice of the left context should be clipped to 0 if the `i-context` < 0.
|
||||
|
||||
>>> from nltk import Text, word_tokenize
|
||||
>>> jane_eyre = 'Chapter 1\nTHERE was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further outdoor exercise was now out of the question.'
|
||||
>>> text = Text(word_tokenize(jane_eyre))
|
||||
>>> text.concordance_list('taking')[0].left
|
||||
['Chapter', '1', 'THERE', 'was', 'no', 'possibility', 'of']
|
||||
@@ -0,0 +1,33 @@
|
||||
import pytest
|
||||
|
||||
from nltk.corpus.reader import CorpusReader
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_plot(mocker):
    """Stub out matplotlib's ``gca`` and ``show`` for every test.

    The fixture is applied automatically. When matplotlib cannot be
    imported there is nothing to patch and the fixture does nothing.
    """
    try:
        import matplotlib.pyplot as pyplot

        # Patch in the same order as before: ``gca`` first, then ``show``.
        for attr_name in ("gca", "show"):
            mocker.patch.object(pyplot, attr_name)
    except ImportError:
        pass
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
def teardown_loaded_corpora():
    """Call ``_unload()`` on corpus readers once a module's tests are done.

    The fixture is module-scoped and applied automatically, so the code
    after ``yield`` runs when the tests of a module (doctest or unit
    test) have finished.
    """
    yield  # let the tests run first; cleanup follows

    import nltk.corpus

    corpus_module = nltk.corpus
    for attr_name in dir(corpus_module):
        candidate = getattr(corpus_module, attr_name, None)
        # Only CorpusReader instances are considered for unloading.
        if not isinstance(candidate, CorpusReader):
            continue
        if hasattr(candidate, "_unload"):
            candidate._unload()
|
||||
2336
Backend/venv/lib/python3.12/site-packages/nltk/test/corpus.doctest
Normal file
2336
Backend/venv/lib/python3.12/site-packages/nltk/test/corpus.doctest
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1,65 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
Crubadan Corpus Reader
|
||||
======================
|
||||
|
||||
Crubadan is an NLTK corpus reader for ngram files provided
|
||||
by the Crubadan project. It supports several languages.
|
||||
|
||||
>>> from nltk.corpus import crubadan
|
||||
>>> crubadan.langs()
|
||||
['abk', 'abn',..., 'zpa', 'zul']
|
||||
|
||||
----------------------------------------
|
||||
Language code mapping and helper methods
|
||||
----------------------------------------
|
||||
|
||||
The web crawler that generates the 3-gram frequencies works at the
|
||||
level of "writing systems" rather than languages. Writing systems
|
||||
are assigned internal 2-3 letter codes that require mapping to the
|
||||
standard ISO 639-3 codes. For more information, please refer to
|
||||
the README in nltk_data/crubadan folder after installing it.
|
||||
|
||||
To translate ISO 639-3 codes to "Crubadan Code":
|
||||
|
||||
>>> crubadan.iso_to_crubadan('eng')
|
||||
'en'
|
||||
>>> crubadan.iso_to_crubadan('fra')
|
||||
'fr'
|
||||
>>> crubadan.iso_to_crubadan('aaa')
|
||||
|
||||
In reverse, print the ISO 639-3 code if we have the Crubadan Code:
|
||||
|
||||
>>> crubadan.crubadan_to_iso('en')
|
||||
'eng'
|
||||
>>> crubadan.crubadan_to_iso('fr')
|
||||
'fra'
|
||||
>>> crubadan.crubadan_to_iso('aa')
|
||||
|
||||
---------------------------
|
||||
Accessing ngram frequencies
|
||||
---------------------------
|
||||
|
||||
On initialization the reader will create a dictionary of every
|
||||
language supported by the Crubadan project, mapping the ISO 639-3
|
||||
language code to its corresponding ngram frequency.
|
||||
|
||||
You can access individual language FreqDist and the ngrams within them as follows:
|
||||
|
||||
>>> english_fd = crubadan.lang_freq('eng')
|
||||
>>> english_fd['the']
|
||||
728135
|
||||
|
||||
Above accesses the FreqDist of English and returns the frequency of the ngram 'the'.
|
||||
An ngram that isn't found within the language will return 0:
|
||||
|
||||
>>> english_fd['sometest']
|
||||
0
|
||||
|
||||
A language that isn't supported will raise an exception:
|
||||
|
||||
>>> crubadan.lang_freq('elvish')
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
RuntimeError: Unsupported language.
|
||||
390
Backend/venv/lib/python3.12/site-packages/nltk/test/data.doctest
Normal file
390
Backend/venv/lib/python3.12/site-packages/nltk/test/data.doctest
Normal file
@@ -0,0 +1,390 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=========================================
|
||||
Loading Resources From the Data Package
|
||||
=========================================
|
||||
|
||||
>>> import nltk.data
|
||||
|
||||
Overview
|
||||
~~~~~~~~
|
||||
The `nltk.data` module contains functions that can be used to load
|
||||
NLTK resource files, such as corpora, grammars, and saved processing
|
||||
objects.
|
||||
|
||||
Loading Data Files
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
Resources are loaded using the function `nltk.data.load()`, which
|
||||
takes as its first argument a URL specifying what file should be
|
||||
loaded. The ``nltk:`` protocol loads files from the NLTK data
|
||||
distribution.
|
||||
|
||||
However, since July 2024, unpickling is restricted to simple types,
|
||||
and now fails with a pickle.UnpicklingError.
|
||||
Instead, all the unsafe pickle packages are now replaced by classes:
|
||||
|
||||
>>> from nltk.tokenize import PunktTokenizer
|
||||
>>> tokenizer = PunktTokenizer()
|
||||
|
||||
>>> tokenizer.tokenize('Hello. This is a test. It works!')
|
||||
['Hello.', 'This is a test.', 'It works!']
|
||||
|
||||
It is important to note that there should be no space following the
|
||||
colon (':') in the URL; 'nltk: tokenizers/punkt/english.pickle' will
|
||||
not work!
|
||||
|
||||
The ``nltk:`` protocol is used by default if no protocol is specified.
|
||||
|
||||
But it is also possible to load resources from ``http:``, ``ftp:``,
|
||||
and ``file:`` URLs:
|
||||
|
||||
>>> # Load a grammar from the NLTK webpage.
|
||||
>>> cfg = nltk.data.load('https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg')
|
||||
>>> print(cfg) # doctest: +ELLIPSIS
|
||||
Grammar with 14 productions (start state = S)
|
||||
S -> NP VP
|
||||
PP -> P NP
|
||||
...
|
||||
P -> 'on'
|
||||
P -> 'in'
|
||||
|
||||
>>> # Load a grammar using an absolute path.
|
||||
>>> url = 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg')
|
||||
>>> url.replace('\\', '/')
|
||||
'file:...toy.cfg'
|
||||
>>> print(nltk.data.load(url))
|
||||
Grammar with 14 productions (start state = S)
|
||||
S -> NP VP
|
||||
PP -> P NP
|
||||
...
|
||||
P -> 'on'
|
||||
P -> 'in'
|
||||
|
||||
The second argument to the `nltk.data.load()` function specifies the
|
||||
file format, which determines how the file's contents are processed
|
||||
before they are returned by ``load()``. The formats that are
|
||||
currently supported by the data module are described by the dictionary
|
||||
`nltk.data.FORMATS`:
|
||||
|
||||
>>> for format, descr in sorted(nltk.data.FORMATS.items()):
|
||||
... print('{0:<7} {1:}'.format(format, descr))
|
||||
cfg A context free grammar.
|
||||
fcfg A feature CFG.
|
||||
fol A list of first order logic expressions, parsed with
|
||||
nltk.sem.logic.Expression.fromstring.
|
||||
json A serialized python object, stored using the json module.
|
||||
logic A list of first order logic expressions, parsed with
|
||||
nltk.sem.logic.LogicParser. Requires an additional logic_parser
|
||||
parameter
|
||||
pcfg A probabilistic CFG.
|
||||
pickle A serialized python object, stored using the pickle
|
||||
module.
|
||||
raw The raw (byte string) contents of a file.
|
||||
text The raw (unicode string) contents of a file.
|
||||
val A semantic valuation, parsed by
|
||||
nltk.sem.Valuation.fromstring.
|
||||
yaml A serialized python object, stored using the yaml module.
|
||||
|
||||
`nltk.data.load()` will raise a ValueError if a bad format name is
|
||||
specified:
|
||||
|
||||
>>> nltk.data.load('grammars/sample_grammars/toy.cfg', 'bar')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Unknown format type!
|
||||
|
||||
By default, the ``"auto"`` format is used, which chooses a format
|
||||
based on the filename's extension. The mapping from file extensions
|
||||
to format names is specified by `nltk.data.AUTO_FORMATS`:
|
||||
|
||||
>>> for ext, format in sorted(nltk.data.AUTO_FORMATS.items()):
|
||||
... print('.%-7s -> %s' % (ext, format))
|
||||
.cfg -> cfg
|
||||
.fcfg -> fcfg
|
||||
.fol -> fol
|
||||
.json -> json
|
||||
.logic -> logic
|
||||
.pcfg -> pcfg
|
||||
.pickle -> pickle
|
||||
.text -> text
|
||||
.txt -> text
|
||||
.val -> val
|
||||
.yaml -> yaml
|
||||
|
||||
If `nltk.data.load()` is unable to determine the format based on the
|
||||
filename's extension, it will raise a ValueError:
|
||||
|
||||
>>> nltk.data.load('foo.bar')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Could not determine format for foo.bar based on its file
|
||||
extension; use the "format" argument to specify the format explicitly.
|
||||
|
||||
Note that by explicitly specifying the ``format`` argument, you can
|
||||
override the load method's default processing behavior. For example,
|
||||
to get the raw (unicode string) contents of any file, simply use ``format="text"``:
|
||||
|
||||
>>> s = nltk.data.load('grammars/sample_grammars/toy.cfg', 'text')
|
||||
>>> print(s)
|
||||
S -> NP VP
|
||||
PP -> P NP
|
||||
NP -> Det N | NP PP
|
||||
VP -> V NP | VP PP
|
||||
...
|
||||
|
||||
Making Local Copies
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
.. This will not be visible in the html output: create a tempdir to
|
||||
play in.
|
||||
>>> import tempfile, os
|
||||
>>> tempdir = tempfile.mkdtemp()
|
||||
>>> old_dir = os.path.abspath('.')
|
||||
>>> os.chdir(tempdir)
|
||||
|
||||
The function `nltk.data.retrieve()` copies a given resource to a local
|
||||
file. This can be useful, for example, if you want to edit one of the
|
||||
sample grammars.
|
||||
|
||||
>>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg')
|
||||
Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg'
|
||||
|
||||
>>> # Simulate editing the grammar.
|
||||
>>> with open('toy.cfg') as inp:
|
||||
... s = inp.read().replace('NP', 'DP')
|
||||
>>> with open('toy.cfg', 'w') as out:
|
||||
... _bytes_written = out.write(s)
|
||||
|
||||
>>> # Load the edited grammar, & display it.
|
||||
>>> cfg = nltk.data.load('file:///' + os.path.abspath('toy.cfg'))
|
||||
>>> print(cfg)
|
||||
Grammar with 14 productions (start state = S)
|
||||
S -> DP VP
|
||||
PP -> P DP
|
||||
...
|
||||
P -> 'on'
|
||||
P -> 'in'
|
||||
|
||||
The second argument to `nltk.data.retrieve()` specifies the filename
|
||||
for the new copy of the file. By default, the source file's filename
|
||||
is used.
|
||||
|
||||
>>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg', 'mytoy.cfg')
|
||||
Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'mytoy.cfg'
|
||||
>>> os.path.isfile('./mytoy.cfg')
|
||||
True
|
||||
>>> nltk.data.retrieve('grammars/sample_grammars/np.fcfg')
|
||||
Retrieving 'nltk:grammars/sample_grammars/np.fcfg', saving to 'np.fcfg'
|
||||
>>> os.path.isfile('./np.fcfg')
|
||||
True
|
||||
|
||||
If a file with the specified (or default) filename already exists in
|
||||
the current directory, then `nltk.data.retrieve()` will raise a
|
||||
ValueError exception. It will *not* overwrite the file:
|
||||
|
||||
>>> os.path.isfile('./toy.cfg')
|
||||
True
|
||||
>>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: File '...toy.cfg' already exists!
|
||||
|
||||
.. This will not be visible in the html output: clean up the tempdir.
|
||||
>>> os.chdir(old_dir)
|
||||
>>> for f in os.listdir(tempdir):
|
||||
... os.remove(os.path.join(tempdir, f))
|
||||
>>> os.rmdir(tempdir)
|
||||
|
||||
Finding Files in the NLTK Data Package
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
The `nltk.data.find()` function searches the NLTK data package for a
|
||||
given file, and returns a pointer to that file. This pointer can
|
||||
either be a `FileSystemPathPointer` (whose `path` attribute gives the
|
||||
absolute path of the file); or a `ZipFilePathPointer`, specifying a
|
||||
zipfile and the name of an entry within that zipfile. Both pointer
|
||||
types define the `open()` method, which can be used to read the string
|
||||
contents of the file.
|
||||
|
||||
>>> path = nltk.data.find('corpora/abc/rural.txt')
|
||||
>>> str(path)
|
||||
'...rural.txt'
|
||||
>>> print(path.open().read(60).decode())
|
||||
PM denies knowledge of AWB kickbacks
|
||||
The Prime Minister has
|
||||
|
||||
Alternatively, the `nltk.data.load()` function can be used with the
|
||||
keyword argument ``format="raw"``:
|
||||
|
||||
>>> s = nltk.data.load('corpora/abc/rural.txt', format='raw')[:60]
|
||||
>>> print(s.decode())
|
||||
PM denies knowledge of AWB kickbacks
|
||||
The Prime Minister has
|
||||
|
||||
Alternatively, you can use the keyword argument ``format="text"``:
|
||||
|
||||
>>> s = nltk.data.load('corpora/abc/rural.txt', format='text')[:60]
|
||||
>>> print(s)
|
||||
PM denies knowledge of AWB kickbacks
|
||||
The Prime Minister has
|
||||
|
||||
Resource Caching
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
NLTK uses a dictionary to maintain a cache of resources that
|
||||
have been loaded. If you load a resource that is already stored in
|
||||
the cache, then the cached copy will be returned. This behavior can
|
||||
be seen by the trace output generated when verbose=True:
|
||||
|
||||
>>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True)
|
||||
<<Loading nltk:grammars/book_grammars/feat0.fcfg>>
|
||||
>>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True)
|
||||
<<Using cached copy of nltk:grammars/book_grammars/feat0.fcfg>>
|
||||
|
||||
If you wish to load a resource from its source, bypassing the cache,
|
||||
use the ``cache=False`` argument to `nltk.data.load()`. This can be
|
||||
useful, for example, if the resource is loaded from a local file, and
|
||||
you are actively editing that file:
|
||||
|
||||
>>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',cache=False,verbose=True)
|
||||
<<Loading nltk:grammars/book_grammars/feat0.fcfg>>
|
||||
|
||||
The cache *no longer* uses weak references. A resource will not be
|
||||
automatically expunged from the cache when no more objects are using
|
||||
it. In the following example, when we clear the variable ``feat0``,
|
||||
the reference count for the feature grammar object drops to zero.
|
||||
However, the object remains cached:
|
||||
|
||||
>>> del feat0
|
||||
>>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',
|
||||
... verbose=True)
|
||||
<<Using cached copy of nltk:grammars/book_grammars/feat0.fcfg>>
|
||||
|
||||
You can clear the entire contents of the cache, using
|
||||
`nltk.data.clear_cache()`:
|
||||
|
||||
>>> nltk.data.clear_cache()
|
||||
|
||||
Retrieving other Data Sources
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
>>> formulas = nltk.data.load('grammars/book_grammars/background.fol')
|
||||
>>> for f in formulas: print(str(f))
|
||||
all x.(boxerdog(x) -> dog(x))
|
||||
all x.(boxer(x) -> person(x))
|
||||
all x.-(dog(x) & person(x))
|
||||
all x.(married(x) <-> exists y.marry(x,y))
|
||||
all x.(bark(x) -> dog(x))
|
||||
all x y.(marry(x,y) -> (person(x) & person(y)))
|
||||
-(Vincent = Mia)
|
||||
-(Vincent = Fido)
|
||||
-(Mia = Fido)
|
||||
|
||||
Regression Tests
|
||||
~~~~~~~~~~~~~~~~
|
||||
Create a temp dir for tests that write files:
|
||||
|
||||
>>> import tempfile, os
|
||||
>>> tempdir = tempfile.mkdtemp()
|
||||
>>> old_dir = os.path.abspath('.')
|
||||
>>> os.chdir(tempdir)
|
||||
|
||||
The `retrieve()` function accepts all url types:
|
||||
|
||||
>>> urls = ['https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg',
|
||||
... 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg'),
|
||||
... 'nltk:grammars/sample_grammars/toy.cfg',
|
||||
... 'grammars/sample_grammars/toy.cfg']
|
||||
>>> for i, url in enumerate(urls):
|
||||
... nltk.data.retrieve(url, 'toy-%d.cfg' % i)
|
||||
Retrieving 'https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', saving to 'toy-0.cfg'
|
||||
Retrieving 'file:...toy.cfg', saving to 'toy-1.cfg'
|
||||
Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-2.cfg'
|
||||
Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-3.cfg'
|
||||
|
||||
Clean up the temp dir:
|
||||
|
||||
>>> os.chdir(old_dir)
|
||||
>>> for f in os.listdir(tempdir):
|
||||
... os.remove(os.path.join(tempdir, f))
|
||||
>>> os.rmdir(tempdir)
|
||||
|
||||
Lazy Loader
|
||||
-----------
|
||||
A lazy loader is a wrapper object that defers loading a resource until
|
||||
it is accessed or used in any way. This is mainly intended for
|
||||
internal use by NLTK's corpus readers.
|
||||
|
||||
>>> # Create a lazy loader for toy.cfg.
|
||||
>>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg')
|
||||
|
||||
>>> # Show that it's not loaded yet:
|
||||
>>> object.__repr__(ll)
|
||||
'<nltk.data.LazyLoader object at ...>'
|
||||
|
||||
>>> # printing it is enough to cause it to be loaded:
|
||||
>>> print(ll)
|
||||
<Grammar with 14 productions>
|
||||
|
||||
>>> # Show that it's now been loaded:
|
||||
>>> object.__repr__(ll)
|
||||
'<nltk.grammar.CFG object at ...>'
|
||||
|
||||
|
||||
>>> # Test that accessing an attribute also loads it:
|
||||
>>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg')
|
||||
>>> ll.start()
|
||||
S
|
||||
>>> object.__repr__(ll)
|
||||
'<nltk.grammar.CFG object at ...>'
|
||||
|
||||
Buffered Gzip Reading and Writing
|
||||
---------------------------------
|
||||
Write performance to gzip-compressed files is extremely poor when the files become large.
|
||||
File creation can become a bottleneck in those cases.
|
||||
|
||||
Read performance from large gzipped pickle files was improved in data.py by
|
||||
buffering the reads. A similar fix can be applied to writes by buffering
|
||||
the writes to a StringIO object first.
|
||||
|
||||
This is mainly intended for internal use. The test simply tests that reading
|
||||
and writing work as intended and does not test how much improvement buffering
|
||||
provides.
|
||||
|
||||
>>> from io import StringIO
|
||||
>>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10)
|
||||
>>> ans = []
|
||||
>>> for i in range(10000):
|
||||
... ans.append(str(i).encode('ascii'))
|
||||
... test.write(str(i).encode('ascii'))
|
||||
>>> test.close()
|
||||
>>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'rb')
|
||||
>>> test.read() == b''.join(ans)
|
||||
True
|
||||
>>> test.close()
|
||||
>>> import os
|
||||
>>> os.unlink('testbuf.gz')
|
||||
|
||||
JSON Encoding and Decoding
|
||||
--------------------------
|
||||
JSON serialization is used instead of pickle for some classes.
|
||||
|
||||
>>> from nltk import jsontags
|
||||
>>> from nltk.jsontags import JSONTaggedEncoder, JSONTaggedDecoder, register_tag
|
||||
>>> @jsontags.register_tag
|
||||
... class JSONSerializable:
|
||||
... json_tag = 'JSONSerializable'
|
||||
...
|
||||
... def __init__(self, n):
|
||||
... self.n = n
|
||||
...
|
||||
... def encode_json_obj(self):
|
||||
... return self.n
|
||||
...
|
||||
... @classmethod
|
||||
... def decode_json_obj(cls, obj):
|
||||
... n = obj
|
||||
... return cls(n)
|
||||
...
|
||||
>>> JSONTaggedEncoder().encode(JSONSerializable(1))
|
||||
'{"!JSONSerializable": 1}'
|
||||
>>> JSONTaggedDecoder().decode('{"!JSONSerializable": 1}').n
|
||||
1
|
||||
241
Backend/venv/lib/python3.12/site-packages/nltk/test/dependency.doctest
Executable file
241
Backend/venv/lib/python3.12/site-packages/nltk/test/dependency.doctest
Executable file
@@ -0,0 +1,241 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===================
|
||||
Dependency Grammars
|
||||
===================
|
||||
|
||||
>>> from nltk.grammar import DependencyGrammar
|
||||
>>> from nltk.parse import (
|
||||
... DependencyGraph,
|
||||
... ProjectiveDependencyParser,
|
||||
... NonprojectiveDependencyParser,
|
||||
... )
|
||||
|
||||
CoNLL Data
|
||||
----------
|
||||
|
||||
>>> treebank_data = """Pierre NNP 2 NMOD
|
||||
... Vinken NNP 8 SUB
|
||||
... , , 2 P
|
||||
... 61 CD 5 NMOD
|
||||
... years NNS 6 AMOD
|
||||
... old JJ 2 NMOD
|
||||
... , , 2 P
|
||||
... will MD 0 ROOT
|
||||
... join VB 8 VC
|
||||
... the DT 11 NMOD
|
||||
... board NN 9 OBJ
|
||||
... as IN 9 VMOD
|
||||
... a DT 15 NMOD
|
||||
... nonexecutive JJ 15 NMOD
|
||||
... director NN 12 PMOD
|
||||
... Nov. NNP 9 VMOD
|
||||
... 29 CD 16 NMOD
|
||||
... . . 9 VMOD
|
||||
... """
|
||||
|
||||
>>> dg = DependencyGraph(treebank_data)
|
||||
>>> dg.tree().pprint()
|
||||
(will
|
||||
(Vinken Pierre , (old (years 61)) ,)
|
||||
(join (board the) (as (director a nonexecutive)) (Nov. 29) .))
|
||||
>>> for head, rel, dep in dg.triples():
|
||||
... print(
|
||||
... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})'
|
||||
... .format(h=head, r=rel, d=dep)
|
||||
... )
|
||||
(will, MD), SUB, (Vinken, NNP)
|
||||
(Vinken, NNP), NMOD, (Pierre, NNP)
|
||||
(Vinken, NNP), P, (,, ,)
|
||||
(Vinken, NNP), NMOD, (old, JJ)
|
||||
(old, JJ), AMOD, (years, NNS)
|
||||
(years, NNS), NMOD, (61, CD)
|
||||
(Vinken, NNP), P, (,, ,)
|
||||
(will, MD), VC, (join, VB)
|
||||
(join, VB), OBJ, (board, NN)
|
||||
(board, NN), NMOD, (the, DT)
|
||||
(join, VB), VMOD, (as, IN)
|
||||
(as, IN), PMOD, (director, NN)
|
||||
(director, NN), NMOD, (a, DT)
|
||||
(director, NN), NMOD, (nonexecutive, JJ)
|
||||
(join, VB), VMOD, (Nov., NNP)
|
||||
(Nov., NNP), NMOD, (29, CD)
|
||||
(join, VB), VMOD, (., .)
|
||||
|
||||
Using a custom cell extractor.
|
||||
|
||||
>>> def custom_extractor(cells):
|
||||
... _, tag, head, rel = cells
|
||||
... return 'spam', 'spam', tag, tag, '', head, rel
|
||||
>>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
|
||||
>>> dg.tree().pprint()
|
||||
(spam
|
||||
(spam spam spam (spam (spam spam)) spam)
|
||||
(spam (spam spam) (spam (spam spam spam)) (spam spam) spam))
|
||||
|
||||
Custom cell extractors can take in and return an index.
|
||||
|
||||
>>> def custom_extractor(cells, index):
|
||||
... word, tag, head, rel = cells
|
||||
... return (index, '{}-{}'.format(word, index), word,
|
||||
... tag, tag, '', head, rel)
|
||||
>>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor)
|
||||
>>> dg.tree().pprint()
|
||||
(will-8
|
||||
(Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7)
|
||||
(join-9
|
||||
(board-11 the-10)
|
||||
(as-12 (director-15 a-13 nonexecutive-14))
|
||||
(Nov.-16 29-17)
|
||||
.-18))
|
||||
|
||||
Using the dependency-parsed version of the Penn Treebank corpus sample.
|
||||
|
||||
>>> from nltk.corpus import dependency_treebank
|
||||
>>> t = dependency_treebank.parsed_sents()[0]
|
||||
>>> print(t.to_conll(3))
|
||||
Pierre NNP 2
|
||||
Vinken NNP 8
|
||||
, , 2
|
||||
61 CD 5
|
||||
years NNS 6
|
||||
old JJ 2
|
||||
, , 2
|
||||
will MD 0
|
||||
join VB 8
|
||||
the DT 11
|
||||
board NN 9
|
||||
as IN 9
|
||||
a DT 15
|
||||
nonexecutive JJ 15
|
||||
director NN 12
|
||||
Nov. NNP 9
|
||||
29 CD 16
|
||||
. . 8
|
||||
|
||||
Using the output of zpar (like Malt-TAB but with zero-based indexing)
|
||||
|
||||
>>> zpar_data = """
|
||||
... Pierre NNP 1 NMOD
|
||||
... Vinken NNP 7 SUB
|
||||
... , , 1 P
|
||||
... 61 CD 4 NMOD
|
||||
... years NNS 5 AMOD
|
||||
... old JJ 1 NMOD
|
||||
... , , 1 P
|
||||
... will MD -1 ROOT
|
||||
... join VB 7 VC
|
||||
... the DT 10 NMOD
|
||||
... board NN 8 OBJ
|
||||
... as IN 8 VMOD
|
||||
... a DT 14 NMOD
|
||||
... nonexecutive JJ 14 NMOD
|
||||
... director NN 11 PMOD
|
||||
... Nov. NNP 8 VMOD
|
||||
... 29 CD 15 NMOD
|
||||
... . . 7 P
|
||||
... """
|
||||
|
||||
>>> zdg = DependencyGraph(zpar_data, zero_based=True)
|
||||
>>> print(zdg.tree())
|
||||
(will
|
||||
(Vinken Pierre , (old (years 61)) ,)
|
||||
(join (board the) (as (director a nonexecutive)) (Nov. 29))
|
||||
.)
|
||||
|
||||
|
||||
Projective Dependency Parsing
|
||||
-----------------------------
|
||||
|
||||
>>> grammar = DependencyGrammar.fromstring("""
|
||||
... 'fell' -> 'price' | 'stock'
|
||||
... 'price' -> 'of' 'the'
|
||||
... 'of' -> 'stock'
|
||||
... 'stock' -> 'the'
|
||||
... """)
|
||||
>>> print(grammar)
|
||||
Dependency grammar with 5 productions
|
||||
'fell' -> 'price'
|
||||
'fell' -> 'stock'
|
||||
'price' -> 'of' 'the'
|
||||
'of' -> 'stock'
|
||||
'stock' -> 'the'
|
||||
|
||||
>>> dp = ProjectiveDependencyParser(grammar)
|
||||
>>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])):
|
||||
... print(t)
|
||||
(fell (price the (of (stock the))))
|
||||
(fell (price the of) (stock the))
|
||||
(fell (price the of the) stock)
|
||||
|
||||
Non-Projective Dependency Parsing
|
||||
---------------------------------
|
||||
|
||||
>>> grammar = DependencyGrammar.fromstring("""
|
||||
... 'taught' -> 'play' | 'man'
|
||||
... 'man' -> 'the'
|
||||
... 'play' -> 'golf' | 'dog' | 'to'
|
||||
... 'dog' -> 'his'
|
||||
... """)
|
||||
>>> print(grammar)
|
||||
Dependency grammar with 7 productions
|
||||
'taught' -> 'play'
|
||||
'taught' -> 'man'
|
||||
'man' -> 'the'
|
||||
'play' -> 'golf'
|
||||
'play' -> 'dog'
|
||||
'play' -> 'to'
|
||||
'dog' -> 'his'
|
||||
|
||||
>>> dp = NonprojectiveDependencyParser(grammar)
|
||||
>>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf'])
|
||||
|
||||
>>> print(g.root['word'])
|
||||
taught
|
||||
|
||||
>>> for _, node in sorted(g.nodes.items()):
|
||||
... if node['word'] is not None:
|
||||
... print('{address} {word}: {d}'.format(d=node['deps'][''], **node))
|
||||
1 the: []
|
||||
2 man: [1]
|
||||
3 taught: [2, 7]
|
||||
4 his: []
|
||||
5 dog: [4]
|
||||
6 to: []
|
||||
7 play: [5, 6, 8]
|
||||
8 golf: []
|
||||
|
||||
>>> print(g.tree())
|
||||
(taught (man the) (play (dog his) to golf))
|
||||
|
||||
Integration with MALT parser
|
||||
============================
|
||||
|
||||
In case the top relation is different from the default, we can set it. In case
|
||||
of MALT parser, it's set to `'null'`.
|
||||
|
||||
>>> dg_str = """1 I _ NN NN _ 2 nn _ _
|
||||
... 2 shot _ NN NN _ 0 null _ _
|
||||
... 3 an _ AT AT _ 2 dep _ _
|
||||
... 4 elephant _ NN NN _ 7 nn _ _
|
||||
... 5 in _ NN NN _ 7 nn _ _
|
||||
... 6 my _ NN NN _ 7 nn _ _
|
||||
... 7 pajamas _ NNS NNS _ 3 dobj _ _
|
||||
... """
|
||||
>>> dg = DependencyGraph(dg_str, top_relation_label='null')
|
||||
|
||||
>>> len(dg.nodes)
|
||||
8
|
||||
|
||||
>>> dg.root['word'], dg.root['address']
|
||||
('shot', 2)
|
||||
|
||||
>>> print(dg.to_conll(10))
|
||||
1 I _ NN NN _ 2 nn _ _
|
||||
2 shot _ NN NN _ 0 null _ _
|
||||
3 an _ AT AT _ 2 dep _ _
|
||||
4 elephant _ NN NN _ 7 nn _ _
|
||||
5 in _ NN NN _ 7 nn _ _
|
||||
6 my _ NN NN _ 7 nn _ _
|
||||
7 pajamas _ NNS NNS _ 3 dobj _ _
|
||||
@@ -0,0 +1,552 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==================
|
||||
Discourse Checking
|
||||
==================
|
||||
|
||||
>>> from nltk import *
|
||||
>>> from nltk.sem import logic
|
||||
>>> logic._counter._value = 0
|
||||
|
||||
Setup
|
||||
=====
|
||||
|
||||
>>> from nltk.test.childes_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
The NLTK discourse module makes it possible to test consistency and
|
||||
redundancy of simple discourses, using theorem-proving and
|
||||
model-building from `nltk.inference`.
|
||||
|
||||
The ``DiscourseTester`` constructor takes a list of sentences as a
|
||||
parameter.
|
||||
|
||||
>>> dt = DiscourseTester(['a boxer walks', 'every boxer chases a girl'])
|
||||
|
||||
The ``DiscourseTester`` parses each sentence into a list of logical
|
||||
forms. Once we have created a ``DiscourseTester`` object, we can
|
||||
inspect various properties of the discourse. First off, we might want
|
||||
to double-check what sentences are currently stored as the discourse.
|
||||
|
||||
>>> dt.sentences()
|
||||
s0: a boxer walks
|
||||
s1: every boxer chases a girl
|
||||
|
||||
As you will see, each sentence receives an identifier `s`\ :subscript:`i`.
|
||||
We might also want to check what grammar the ``DiscourseTester`` is
|
||||
using (by default, ``book_grammars/discourse.fcfg``):
|
||||
|
||||
>>> dt.grammar()
|
||||
% start S
|
||||
# Grammar Rules
|
||||
S[SEM = <app(?subj,?vp)>] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp]
|
||||
NP[NUM=?n,SEM=<app(?det,?nom)> ] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom]
|
||||
NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np]
|
||||
...
|
||||
|
||||
A different grammar can be invoked by using the optional ``gramfile``
|
||||
parameter when a ``DiscourseTester`` object is created.
|
||||
|
||||
Readings and Threads
|
||||
====================
|
||||
|
||||
Depending on
|
||||
the grammar used, we may find some sentences have more than one
|
||||
logical form. To check this, use the ``readings()`` method. Given a
|
||||
sentence identifier of the form `s`\ :subscript:`i`, each reading of
|
||||
that sentence is given an identifier `s`\ :sub:`i`-`r`\ :sub:`j`.
|
||||
|
||||
|
||||
>>> dt.readings()
|
||||
<BLANKLINE>
|
||||
s0 readings:
|
||||
<BLANKLINE>
|
||||
s0-r0: exists z1.(boxer(z1) & walk(z1))
|
||||
s0-r1: exists z1.(boxerdog(z1) & walk(z1))
|
||||
<BLANKLINE>
|
||||
s1 readings:
|
||||
<BLANKLINE>
|
||||
s1-r0: all z2.(boxer(z2) -> exists z3.(girl(z3) & chase(z2,z3)))
|
||||
s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
|
||||
|
||||
|
||||
In this case, the only source of ambiguity lies in the word *boxer*,
|
||||
which receives two translations: ``boxer`` and ``boxerdog``. The
|
||||
intention is that one of these corresponds to the ``person`` sense and
|
||||
one to the ``dog`` sense. In principle, we would also expect to see a
|
||||
quantifier scope ambiguity in ``s1``. However, the simple grammar we
|
||||
are using, namely ``book_grammars/discourse.fcfg``, doesn't support quantifier
|
||||
scope ambiguity.
|
||||
|
||||
We can also investigate the readings of a specific sentence:
|
||||
|
||||
>>> dt.readings('a boxer walks')
|
||||
The sentence 'a boxer walks' has these readings:
|
||||
exists x.(boxer(x) & walk(x))
|
||||
exists x.(boxerdog(x) & walk(x))
|
||||
|
||||
Given that each sentence is two-way ambiguous, we potentially have
|
||||
four different discourse 'threads', taking all combinations of
|
||||
readings. To see these, specify the ``threaded=True`` parameter on
|
||||
the ``readings()`` method. Again, each thread is assigned an
|
||||
identifier of the form `d`\ :sub:`i`. Following the identifier is a
|
||||
list of the readings that constitute that thread.
|
||||
|
||||
>>> dt.readings(threaded=True)
|
||||
d0: ['s0-r0', 's1-r0']
|
||||
d1: ['s0-r0', 's1-r1']
|
||||
d2: ['s0-r1', 's1-r0']
|
||||
d3: ['s0-r1', 's1-r1']
|
||||
|
||||
Of course, this simple-minded approach doesn't scale: a discourse with, say, three
|
||||
sentences, each of which has 3 readings, will generate 27 different
|
||||
threads. It is an interesting exercise to consider how to manage
|
||||
discourse ambiguity more efficiently.
|
||||
|
||||
Checking Consistency
|
||||
====================
|
||||
|
||||
Now, we can check whether some or all of the discourse threads are
|
||||
consistent, using the ``models()`` method. With no parameter, this
|
||||
method will try to find a model for every discourse thread in the
|
||||
current discourse. However, we can also specify just one thread, say ``d1``.
|
||||
|
||||
>>> dt.models('d1')
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d1
|
||||
--------------------------------------------------------------------------------
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 2
|
||||
<BLANKLINE>
|
||||
c1 = 0.
|
||||
<BLANKLINE>
|
||||
f1(0) = 0.
|
||||
f1(1) = 0.
|
||||
<BLANKLINE>
|
||||
boxer(0).
|
||||
- boxer(1).
|
||||
<BLANKLINE>
|
||||
- boxerdog(0).
|
||||
- boxerdog(1).
|
||||
<BLANKLINE>
|
||||
- girl(0).
|
||||
- girl(1).
|
||||
<BLANKLINE>
|
||||
walk(0).
|
||||
- walk(1).
|
||||
<BLANKLINE>
|
||||
- chase(0,0).
|
||||
- chase(0,1).
|
||||
- chase(1,0).
|
||||
- chase(1,1).
|
||||
<BLANKLINE>
|
||||
Consistent discourse: d1 ['s0-r0', 's1-r1']:
|
||||
s0-r0: exists z1.(boxer(z1) & walk(z1))
|
||||
s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
|
||||
<BLANKLINE>
|
||||
|
||||
There are various formats for rendering **Mace4** models --- here,
|
||||
we have used the 'cooked' format (which is intended to be
|
||||
human-readable). There are a number of points to note.
|
||||
|
||||
#. The entities in the domain are all treated as non-negative
|
||||
integers. In this case, there are only two entities, ``0`` and
|
||||
``1``.
|
||||
|
||||
#. The ``-`` symbol indicates negation. So ``0`` is the only
|
||||
``boxer`` and the only thing that ``walk``\ s. Nothing is a
|
||||
``boxerdog``, or a ``girl`` or in the ``chase`` relation. Thus the
|
||||
universal sentence is vacuously true.
|
||||
|
||||
#. ``c1`` is an introduced constant that denotes ``0``.
|
||||
|
||||
#. ``f1`` is a Skolem function, but it plays no significant role in
|
||||
this model.
|
||||
|
||||
|
||||
We might want to now add another sentence to the discourse, and there
|
||||
is a method ``add_sentence()`` for doing just this.
|
||||
|
||||
>>> dt.add_sentence('John is a boxer')
|
||||
>>> dt.sentences()
|
||||
s0: a boxer walks
|
||||
s1: every boxer chases a girl
|
||||
s2: John is a boxer
|
||||
|
||||
We can now test all the properties as before; here, we just show a
|
||||
couple of them.
|
||||
|
||||
>>> dt.readings()
|
||||
<BLANKLINE>
|
||||
s0 readings:
|
||||
<BLANKLINE>
|
||||
s0-r0: exists z1.(boxer(z1) & walk(z1))
|
||||
s0-r1: exists z1.(boxerdog(z1) & walk(z1))
|
||||
<BLANKLINE>
|
||||
s1 readings:
|
||||
<BLANKLINE>
|
||||
s1-r0: all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
|
||||
s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
|
||||
<BLANKLINE>
|
||||
s2 readings:
|
||||
<BLANKLINE>
|
||||
s2-r0: boxer(John)
|
||||
s2-r1: boxerdog(John)
|
||||
>>> dt.readings(threaded=True)
|
||||
d0: ['s0-r0', 's1-r0', 's2-r0']
|
||||
d1: ['s0-r0', 's1-r0', 's2-r1']
|
||||
d2: ['s0-r0', 's1-r1', 's2-r0']
|
||||
d3: ['s0-r0', 's1-r1', 's2-r1']
|
||||
d4: ['s0-r1', 's1-r0', 's2-r0']
|
||||
d5: ['s0-r1', 's1-r0', 's2-r1']
|
||||
d6: ['s0-r1', 's1-r1', 's2-r0']
|
||||
d7: ['s0-r1', 's1-r1', 's2-r1']
|
||||
|
||||
If you are interested in a particular thread, the ``expand_threads()``
|
||||
method will remind you of what readings it consists of:
|
||||
|
||||
>>> thread = dt.expand_threads('d1')
|
||||
>>> for rid, reading in thread:
|
||||
... print(rid, str(reading.normalize()))
|
||||
s0-r0 exists z1.(boxer(z1) & walk(z1))
|
||||
s1-r0 all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2)))
|
||||
s2-r1 boxerdog(John)
|
||||
|
||||
Suppose we have already defined a discourse, as follows:
|
||||
|
||||
>>> dt = DiscourseTester(['A student dances', 'Every student is a person'])
|
||||
|
||||
Now, when we add a new sentence, is it consistent with what we already
|
||||
have? The ``consistchk=True`` parameter of ``add_sentence()`` allows
|
||||
us to check:
|
||||
|
||||
>>> dt.add_sentence('No person dances', consistchk=True)
|
||||
Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']:
|
||||
s0-r0: exists z1.(student(z1) & dance(z1))
|
||||
s1-r0: all z1.(student(z1) -> person(z1))
|
||||
s2-r0: -exists z1.(person(z1) & dance(z1))
|
||||
<BLANKLINE>
|
||||
>>> dt.readings()
|
||||
<BLANKLINE>
|
||||
s0 readings:
|
||||
<BLANKLINE>
|
||||
s0-r0: exists z1.(student(z1) & dance(z1))
|
||||
<BLANKLINE>
|
||||
s1 readings:
|
||||
<BLANKLINE>
|
||||
s1-r0: all z1.(student(z1) -> person(z1))
|
||||
<BLANKLINE>
|
||||
s2 readings:
|
||||
<BLANKLINE>
|
||||
s2-r0: -exists z1.(person(z1) & dance(z1))
|
||||
|
||||
So let's retract the inconsistent sentence:
|
||||
|
||||
>>> dt.retract_sentence('No person dances', verbose=True)
|
||||
Current sentences are
|
||||
s0: A student dances
|
||||
s1: Every student is a person
|
||||
|
||||
We can now verify that the result is consistent.
|
||||
|
||||
>>> dt.models()
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d0
|
||||
--------------------------------------------------------------------------------
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 2
|
||||
<BLANKLINE>
|
||||
c1 = 0.
|
||||
<BLANKLINE>
|
||||
dance(0).
|
||||
- dance(1).
|
||||
<BLANKLINE>
|
||||
person(0).
|
||||
- person(1).
|
||||
<BLANKLINE>
|
||||
student(0).
|
||||
- student(1).
|
||||
<BLANKLINE>
|
||||
Consistent discourse: d0 ['s0-r0', 's1-r0']:
|
||||
s0-r0: exists z1.(student(z1) & dance(z1))
|
||||
s1-r0: all z1.(student(z1) -> person(z1))
|
||||
<BLANKLINE>
|
||||
|
||||
Checking Informativity
|
||||
======================
|
||||
|
||||
Let's assume that we are still trying to extend the discourse *A
|
||||
student dances.* *Every student is a person.* We add a new sentence,
|
||||
but this time, we check whether it is informative with respect to what
|
||||
has gone before.
|
||||
|
||||
>>> dt.add_sentence('A person dances', informchk=True)
|
||||
Sentence 'A person dances' under reading 'exists x.(person(x) & dance(x))':
|
||||
Not informative relative to thread 'd0'
|
||||
|
||||
In fact, we are just checking whether the new sentence is entailed by
|
||||
the preceding discourse.
|
||||
|
||||
>>> dt.models()
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d0
|
||||
--------------------------------------------------------------------------------
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 2
|
||||
<BLANKLINE>
|
||||
c1 = 0.
|
||||
<BLANKLINE>
|
||||
c2 = 0.
|
||||
<BLANKLINE>
|
||||
dance(0).
|
||||
- dance(1).
|
||||
<BLANKLINE>
|
||||
person(0).
|
||||
- person(1).
|
||||
<BLANKLINE>
|
||||
student(0).
|
||||
- student(1).
|
||||
<BLANKLINE>
|
||||
Consistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']:
|
||||
s0-r0: exists z1.(student(z1) & dance(z1))
|
||||
s1-r0: all z1.(student(z1) -> person(z1))
|
||||
s2-r0: exists z1.(person(z1) & dance(z1))
|
||||
<BLANKLINE>
|
||||
|
||||
|
||||
|
||||
Adding Background Knowledge
|
||||
===========================
|
||||
|
||||
Let's build a new discourse, and look at the readings of the component sentences:
|
||||
|
||||
>>> dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'])
|
||||
>>> dt.readings()
|
||||
<BLANKLINE>
|
||||
s0 readings:
|
||||
<BLANKLINE>
|
||||
s0-r0: boxer(Vincent)
|
||||
s0-r1: boxerdog(Vincent)
|
||||
<BLANKLINE>
|
||||
s1 readings:
|
||||
<BLANKLINE>
|
||||
s1-r0: boxer(Fido)
|
||||
s1-r1: boxerdog(Fido)
|
||||
<BLANKLINE>
|
||||
s2 readings:
|
||||
<BLANKLINE>
|
||||
s2-r0: married(Vincent)
|
||||
<BLANKLINE>
|
||||
s3 readings:
|
||||
<BLANKLINE>
|
||||
s3-r0: bark(Fido)
|
||||
|
||||
This gives us a lot of threads:
|
||||
|
||||
>>> dt.readings(threaded=True)
|
||||
d0: ['s0-r0', 's1-r0', 's2-r0', 's3-r0']
|
||||
d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0']
|
||||
d2: ['s0-r1', 's1-r0', 's2-r0', 's3-r0']
|
||||
d3: ['s0-r1', 's1-r1', 's2-r0', 's3-r0']
|
||||
|
||||
|
||||
We can eliminate some of the readings, and hence some of the threads,
|
||||
by adding background information.
|
||||
|
||||
>>> import nltk.data
|
||||
>>> bg = nltk.data.load('grammars/book_grammars/background.fol')
|
||||
>>> dt.add_background(bg)
|
||||
>>> dt.background()
|
||||
all x.(boxerdog(x) -> dog(x))
|
||||
all x.(boxer(x) -> person(x))
|
||||
all x.-(dog(x) & person(x))
|
||||
all x.(married(x) <-> exists y.marry(x,y))
|
||||
all x.(bark(x) -> dog(x))
|
||||
all x y.(marry(x,y) -> (person(x) & person(y)))
|
||||
-(Vincent = Mia)
|
||||
-(Vincent = Fido)
|
||||
-(Mia = Fido)
|
||||
|
||||
The background information allows us to reject three of the threads as
|
||||
inconsistent. To see what remains, use the ``filter=True`` parameter
|
||||
on ``readings()``.
|
||||
|
||||
>>> dt.readings(filter=True)
|
||||
d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0']
|
||||
|
||||
The ``models()`` method gives us more information about the surviving thread.
|
||||
|
||||
>>> dt.models()
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d0
|
||||
--------------------------------------------------------------------------------
|
||||
No model found!
|
||||
<BLANKLINE>
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d1
|
||||
--------------------------------------------------------------------------------
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 3
|
||||
<BLANKLINE>
|
||||
Fido = 0.
|
||||
<BLANKLINE>
|
||||
Mia = 1.
|
||||
<BLANKLINE>
|
||||
Vincent = 2.
|
||||
<BLANKLINE>
|
||||
f1(0) = 0.
|
||||
f1(1) = 0.
|
||||
f1(2) = 2.
|
||||
<BLANKLINE>
|
||||
bark(0).
|
||||
- bark(1).
|
||||
- bark(2).
|
||||
<BLANKLINE>
|
||||
- boxer(0).
|
||||
- boxer(1).
|
||||
boxer(2).
|
||||
<BLANKLINE>
|
||||
boxerdog(0).
|
||||
- boxerdog(1).
|
||||
- boxerdog(2).
|
||||
<BLANKLINE>
|
||||
dog(0).
|
||||
- dog(1).
|
||||
- dog(2).
|
||||
<BLANKLINE>
|
||||
- married(0).
|
||||
- married(1).
|
||||
married(2).
|
||||
<BLANKLINE>
|
||||
- person(0).
|
||||
- person(1).
|
||||
person(2).
|
||||
<BLANKLINE>
|
||||
- marry(0,0).
|
||||
- marry(0,1).
|
||||
- marry(0,2).
|
||||
- marry(1,0).
|
||||
- marry(1,1).
|
||||
- marry(1,2).
|
||||
- marry(2,0).
|
||||
- marry(2,1).
|
||||
marry(2,2).
|
||||
<BLANKLINE>
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d2
|
||||
--------------------------------------------------------------------------------
|
||||
No model found!
|
||||
<BLANKLINE>
|
||||
--------------------------------------------------------------------------------
|
||||
Model for Discourse Thread d3
|
||||
--------------------------------------------------------------------------------
|
||||
No model found!
|
||||
<BLANKLINE>
|
||||
Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0', 's3-r0']:
|
||||
s0-r0: boxer(Vincent)
|
||||
s1-r0: boxer(Fido)
|
||||
s2-r0: married(Vincent)
|
||||
s3-r0: bark(Fido)
|
||||
<BLANKLINE>
|
||||
Consistent discourse: d1 ['s0-r0', 's1-r1', 's2-r0', 's3-r0']:
|
||||
s0-r0: boxer(Vincent)
|
||||
s1-r1: boxerdog(Fido)
|
||||
s2-r0: married(Vincent)
|
||||
s3-r0: bark(Fido)
|
||||
<BLANKLINE>
|
||||
Inconsistent discourse: d2 ['s0-r1', 's1-r0', 's2-r0', 's3-r0']:
|
||||
s0-r1: boxerdog(Vincent)
|
||||
s1-r0: boxer(Fido)
|
||||
s2-r0: married(Vincent)
|
||||
s3-r0: bark(Fido)
|
||||
<BLANKLINE>
|
||||
Inconsistent discourse: d3 ['s0-r1', 's1-r1', 's2-r0', 's3-r0']:
|
||||
s0-r1: boxerdog(Vincent)
|
||||
s1-r1: boxerdog(Fido)
|
||||
s2-r0: married(Vincent)
|
||||
s3-r0: bark(Fido)
|
||||
<BLANKLINE>
|
||||
|
||||
|
||||
.. This will not be visible in the html output: create a tempdir to
|
||||
play in.
|
||||
>>> import tempfile, os
|
||||
>>> tempdir = tempfile.mkdtemp()
|
||||
>>> old_dir = os.path.abspath('.')
|
||||
>>> os.chdir(tempdir)
|
||||
|
||||
In order to play around with your own version of background knowledge,
|
||||
you might want to start off with a local copy of ``background.fol``:
|
||||
|
||||
>>> nltk.data.retrieve('grammars/book_grammars/background.fol')
|
||||
Retrieving 'nltk:grammars/book_grammars/background.fol', saving to 'background.fol'
|
||||
|
||||
After you have modified the file, the ``load_fol()`` function will parse
|
||||
the strings in the file into expressions of ``nltk.sem.logic``.
|
||||
|
||||
>>> from nltk.inference.discourse import load_fol
|
||||
>>> mybg = load_fol(open('background.fol').read())
|
||||
|
||||
The result can be loaded as an argument of ``add_background()`` in the
|
||||
manner shown earlier.
|
||||
|
||||
.. This will not be visible in the html output: clean up the tempdir.
|
||||
>>> os.chdir(old_dir)
|
||||
>>> for f in os.listdir(tempdir):
|
||||
... os.remove(os.path.join(tempdir, f))
|
||||
>>> os.rmdir(tempdir)
|
||||
>>> nltk.data.clear_cache()
|
||||
|
||||
|
||||
Regression Testing from book
|
||||
============================
|
||||
|
||||
>>> logic._counter._value = 0
|
||||
|
||||
>>> from nltk.tag import RegexpTagger
|
||||
>>> tagger = RegexpTagger(
|
||||
... [('^(chases|runs)$', 'VB'),
|
||||
... ('^(a)$', 'ex_quant'),
|
||||
... ('^(every)$', 'univ_quant'),
|
||||
... ('^(dog|boy)$', 'NN'),
|
||||
... ('^(He)$', 'PRP')
|
||||
... ])
|
||||
>>> rc = DrtGlueReadingCommand(depparser=MaltParser(tagger=tagger))
|
||||
>>> dt = DiscourseTester(map(str.split, ['Every dog chases a boy', 'He runs']), rc)
|
||||
>>> dt.readings()
|
||||
<BLANKLINE>
|
||||
s0 readings:
|
||||
<BLANKLINE>
|
||||
s0-r0: ([z2],[boy(z2), (([z5],[dog(z5)]) -> ([],[chases(z5,z2)]))])
|
||||
s0-r1: ([],[(([z1],[dog(z1)]) -> ([z2],[boy(z2), chases(z1,z2)]))])
|
||||
<BLANKLINE>
|
||||
s1 readings:
|
||||
<BLANKLINE>
|
||||
s1-r0: ([z1],[PRO(z1), runs(z1)])
|
||||
>>> dt.readings(show_thread_readings=True)
|
||||
d0: ['s0-r0', 's1-r0'] : ([z1,z2],[boy(z1), (([z3],[dog(z3)]) -> ([],[chases(z3,z1)])), (z2 = z1), runs(z2)])
|
||||
d1: ['s0-r1', 's1-r0'] : INVALID: AnaphoraResolutionException
|
||||
>>> dt.readings(filter=True, show_thread_readings=True)
|
||||
d0: ['s0-r0', 's1-r0'] : ([z1,z3],[boy(z1), (([z2],[dog(z2)]) -> ([],[chases(z2,z1)])), (z3 = z1), runs(z3)])
|
||||
|
||||
>>> logic._counter._value = 0
|
||||
|
||||
>>> from nltk.parse import FeatureEarleyChartParser
|
||||
>>> from nltk.sem.drt import DrtParser
|
||||
>>> grammar = nltk.data.load('grammars/book_grammars/drt.fcfg', logic_parser=DrtParser())
|
||||
>>> parser = FeatureEarleyChartParser(grammar, trace=0)
|
||||
>>> trees = parser.parse('Angus owns a dog'.split())
|
||||
>>> print(list(trees)[0].label()['SEM'].simplify().normalize())
|
||||
([z1,z2],[Angus(z1), dog(z2), own(z1,z2)])
|
||||
515
Backend/venv/lib/python3.12/site-packages/nltk/test/drt.doctest
Normal file
515
Backend/venv/lib/python3.12/site-packages/nltk/test/drt.doctest
Normal file
@@ -0,0 +1,515 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
================================
|
||||
Discourse Representation Theory
|
||||
================================
|
||||
|
||||
>>> from nltk.sem import logic
|
||||
>>> from nltk.inference import TableauProver
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
A DRS can be created with the ``DRS()`` constructor. This takes two arguments: a list of
|
||||
discourse referents and a list of conditions.
|
||||
|
||||
>>> from nltk.sem.drt import *
|
||||
>>> dexpr = DrtExpression.fromstring
|
||||
>>> man_x = dexpr('man(x)')
|
||||
>>> walk_x = dexpr('walk(x)')
|
||||
>>> x = dexpr('x')
|
||||
>>> print(DRS([x], [man_x, walk_x]))
|
||||
([x],[man(x), walk(x)])
|
||||
|
||||
The ``DrtExpression.fromstring()`` method can also be applied directly to DRS
|
||||
expressions, which allows them to be specified more
|
||||
easily.
|
||||
|
||||
>>> drs1 = dexpr('([x],[man(x),walk(x)])')
|
||||
>>> print(drs1)
|
||||
([x],[man(x), walk(x)])
|
||||
|
||||
DRSs can be *merged* using the ``+`` operator.
|
||||
|
||||
>>> drs2 = dexpr('([y],[woman(y),stop(y)])')
|
||||
>>> drs3 = drs1 + drs2
|
||||
>>> print(drs3)
|
||||
(([x],[man(x), walk(x)]) + ([y],[woman(y), stop(y)]))
|
||||
>>> print(drs3.simplify())
|
||||
([x,y],[man(x), walk(x), woman(y), stop(y)])
|
||||
|
||||
We can embed DRSs as components of an ``implies`` condition.
|
||||
|
||||
>>> s = '([], [(%s -> %s)])' % (drs1, drs2)
|
||||
>>> print(dexpr(s))
|
||||
([],[(([x],[man(x), walk(x)]) -> ([y],[woman(y), stop(y)]))])
|
||||
|
||||
The ``fol()`` method converts DRSs into FOL formulae.
|
||||
|
||||
>>> print(dexpr(r'([x],[man(x), walks(x)])').fol())
|
||||
exists x.(man(x) & walks(x))
|
||||
>>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol())
|
||||
all x.(man(x) -> walks(x))
|
||||
|
||||
In order to visualize a DRS, the ``pretty_format()`` method can be used.
|
||||
|
||||
>>> print(drs3.pretty_format())
|
||||
_________ __________
|
||||
| x | | y |
|
||||
(|---------| + |----------|)
|
||||
| man(x) | | woman(y) |
|
||||
| walk(x) | | stop(y) |
|
||||
|_________| |__________|
|
||||
|
||||
|
||||
Parse to semantics
|
||||
------------------
|
||||
|
||||
..
|
||||
>>> logic._counter._value = 0
|
||||
|
||||
DRSs can be used for building compositional semantics in a feature
|
||||
based grammar. To specify that we want to use DRSs, the appropriate
|
||||
logic parser needs to be passed as a parameter to ``load_parser()``:
|
||||
|
||||
>>> from nltk.parse import load_parser
|
||||
>>> from nltk.sem.drt import DrtParser
|
||||
>>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, logic_parser=DrtParser())
|
||||
>>> for tree in parser.parse('a dog barks'.split()):
|
||||
... print(tree.label()['SEM'].simplify())
|
||||
...
|
||||
([x],[dog(x), bark(x)])
|
||||
|
||||
Alternatively, a ``FeatStructReader`` can be passed with the ``logic_parser`` set on it
|
||||
|
||||
>>> from nltk.featstruct import FeatStructReader
|
||||
>>> from nltk.grammar import FeatStructNonterminal
|
||||
>>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, fstruct_reader=FeatStructReader(fdict_class=FeatStructNonterminal, logic_parser=DrtParser()))
|
||||
>>> for tree in parser.parse('every girl chases a dog'.split()):
|
||||
... print(tree.label()['SEM'].simplify().normalize())
|
||||
...
|
||||
([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chase(z1,z2)]))])
|
||||
|
||||
|
||||
|
||||
Unit Tests
|
||||
==========
|
||||
|
||||
Parser
|
||||
------
|
||||
|
||||
>>> print(dexpr(r'([x,y],[sees(x,y)])'))
|
||||
([x,y],[sees(x,y)])
|
||||
>>> print(dexpr(r'([x],[man(x), walks(x)])'))
|
||||
([x],[man(x), walks(x)])
|
||||
>>> print(dexpr(r'\x.([],[man(x), walks(x)])'))
|
||||
\x.([],[man(x), walks(x)])
|
||||
>>> print(dexpr(r'\x.\y.([],[sees(x,y)])'))
|
||||
\x y.([],[sees(x,y)])
|
||||
|
||||
>>> print(dexpr(r'([x,y],[(x = y)])'))
|
||||
([x,y],[(x = y)])
|
||||
>>> print(dexpr(r'([x,y],[(x != y)])'))
|
||||
([x,y],[-(x = y)])
|
||||
|
||||
>>> print(dexpr(r'\x.([],[walks(x)])(john)'))
|
||||
(\x.([],[walks(x)]))(john)
|
||||
>>> print(dexpr(r'\R.\x.([],[big(x,R)])(\y.([],[mouse(y)]))'))
|
||||
(\R x.([],[big(x,R)]))(\y.([],[mouse(y)]))
|
||||
|
||||
>>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))'))
|
||||
(([x],[walks(x)]) + ([y],[runs(y)]))
|
||||
>>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))'))
|
||||
(([x,y],[walks(x), jumps(y)]) + ([z],[twos(z)]) + ([w],[runs(w)]))
|
||||
>>> print(dexpr(r'((([],[walks(x)]) + ([],[twos(x)])) + ([],[runs(x)]))'))
|
||||
(([],[walks(x)]) + ([],[twos(x)]) + ([],[runs(x)]))
|
||||
>>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)])) + (([],[threes(x)]) + ([],[fours(x)])))'))
|
||||
(([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)]))
|
||||
|
||||
>>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))'))
|
||||
(([],[walks(x)]) -> ([],[runs(x)]))
|
||||
|
||||
>>> print(dexpr(r'([x],[PRO(x), sees(John,x)])'))
|
||||
([x],[PRO(x), sees(John,x)])
|
||||
>>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])'))
|
||||
([x],[man(x), -([],[walks(x)])])
|
||||
>>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])'))
|
||||
([],[(([x],[man(x)]) -> ([],[walks(x)]))])
|
||||
|
||||
>>> print(dexpr(r'DRS([x],[walk(x)])'))
|
||||
([x],[walk(x)])
|
||||
>>> print(dexpr(r'DRS([x][walk(x)])'))
|
||||
([x],[walk(x)])
|
||||
>>> print(dexpr(r'([x][walk(x)])'))
|
||||
([x],[walk(x)])
|
||||
|
||||
``simplify()``
|
||||
--------------
|
||||
|
||||
>>> print(dexpr(r'\x.([],[man(x), walks(x)])(john)').simplify())
|
||||
([],[man(john), walks(john)])
|
||||
>>> print(dexpr(r'\x.\y.([z],[dog(z),sees(x,y)])(john)(mary)').simplify())
|
||||
([z],[dog(z), sees(john,mary)])
|
||||
>>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').simplify())
|
||||
\x.([],[big(x,\y.([],[mouse(y)]))])
|
||||
|
||||
>>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').simplify())
|
||||
([x,y],[walks(x), runs(y)])
|
||||
>>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))').simplify())
|
||||
([w,x,y,z],[walks(x), jumps(y), twos(z), runs(w)])
|
||||
>>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])))').simplify())
|
||||
([],[walks(x), runs(x), threes(x), fours(x)])
|
||||
>>> dexpr(r'([x],[man(x)])+([x],[walks(x)])').simplify() == \
|
||||
... dexpr(r'([x,z1],[man(x), walks(z1)])')
|
||||
True
|
||||
>>> dexpr(r'([y],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)]))])+([x],[run(x)])').simplify() == \
|
||||
... dexpr(r'([y,z1],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)])), run(z1)])')
|
||||
True
|
||||
|
||||
>>> dexpr(r'\Q.(([x],[john(x),walks(x)]) + Q)(([x],[PRO(x),leaves(x)]))').simplify() == \
|
||||
... dexpr(r'([x,z1],[john(x), walks(x), PRO(z1), leaves(z1)])')
|
||||
True
|
||||
|
||||
>>> logic._counter._value = 0
|
||||
>>> print(dexpr('([],[(([x],[dog(x)]) -> ([e,y],[boy(y), chase(e), subj(e,x), obj(e,y)]))])+([e,x],[PRO(x), run(e), subj(e,x)])').simplify().normalize().normalize())
|
||||
([e02,z5],[(([z3],[dog(z3)]) -> ([e01,z4],[boy(z4), chase(e01), subj(e01,z3), obj(e01,z4)])), PRO(z5), run(e02), subj(e02,z5)])
|
||||
|
||||
``fol()``
|
||||
-----------
|
||||
|
||||
>>> print(dexpr(r'([x,y],[sees(x,y)])').fol())
|
||||
exists x y.sees(x,y)
|
||||
>>> print(dexpr(r'([x],[man(x), walks(x)])').fol())
|
||||
exists x.(man(x) & walks(x))
|
||||
>>> print(dexpr(r'\x.([],[man(x), walks(x)])').fol())
|
||||
\x.(man(x) & walks(x))
|
||||
>>> print(dexpr(r'\x y.([],[sees(x,y)])').fol())
|
||||
\x y.sees(x,y)
|
||||
|
||||
>>> print(dexpr(r'\x.([],[walks(x)])(john)').fol())
|
||||
\x.walks(x)(john)
|
||||
>>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').fol())
|
||||
(\R x.big(x,R))(\y.mouse(y))
|
||||
|
||||
>>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').fol())
|
||||
(exists x.walks(x) & exists y.runs(y))
|
||||
|
||||
>>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))').fol())
|
||||
(walks(x) -> runs(x))
|
||||
|
||||
>>> print(dexpr(r'([x],[PRO(x), sees(John,x)])').fol())
|
||||
exists x.(PRO(x) & sees(John,x))
|
||||
>>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])').fol())
|
||||
exists x.(man(x) & -walks(x))
|
||||
>>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol())
|
||||
all x.(man(x) -> walks(x))
|
||||
|
||||
>>> print(dexpr(r'([x],[man(x) | walks(x)])').fol())
|
||||
exists x.(man(x) | walks(x))
|
||||
>>> print(dexpr(r'P(x) + ([x],[walks(x)])').fol())
|
||||
(P(x) & exists x.walks(x))
|
||||
|
||||
``resolve_anaphora()``
|
||||
----------------------
|
||||
|
||||
>>> from nltk.sem.drt import AnaphoraResolutionException
|
||||
|
||||
>>> print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])')))
|
||||
([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])])
|
||||
>>> print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])')))
|
||||
([],[(([x],[dog(x)]) -> ([y],[walks(y), (y = x)]))])
|
||||
>>> print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')).simplify())
|
||||
([x,y],[(x = y)])
|
||||
>>> try: print(resolve_anaphora(dexpr(r'([x],[walks(x), PRO(x)])')))
|
||||
... except AnaphoraResolutionException as e: print(e)
|
||||
Variable 'x' does not resolve to anything.
|
||||
>>> print(resolve_anaphora(dexpr('([e01,z6,z7],[boy(z6), PRO(z7), run(e01), subj(e01,z7)])')))
|
||||
([e01,z6,z7],[boy(z6), (z7 = z6), run(e01), subj(e01,z7)])
|
||||
|
||||
``equiv()``:
|
||||
----------------
|
||||
|
||||
>>> a = dexpr(r'([x],[man(x), walks(x)])')
|
||||
>>> b = dexpr(r'([x],[walks(x), man(x)])')
|
||||
>>> print(a.equiv(b, TableauProver()))
|
||||
True
|
||||
|
||||
|
||||
``replace()``:
|
||||
--------------
|
||||
|
||||
>>> a = dexpr(r'a')
|
||||
>>> w = dexpr(r'w')
|
||||
>>> x = dexpr(r'x')
|
||||
>>> y = dexpr(r'y')
|
||||
>>> z = dexpr(r'z')
|
||||
|
||||
|
||||
replace bound
|
||||
-------------
|
||||
|
||||
>>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, False))
|
||||
([x],[give(x,y,z)])
|
||||
>>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, True))
|
||||
([a],[give(a,y,z)])
|
||||
|
||||
replace unbound
|
||||
---------------
|
||||
|
||||
>>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, False))
|
||||
([x],[give(x,a,z)])
|
||||
>>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, True))
|
||||
([x],[give(x,a,z)])
|
||||
|
||||
replace unbound with bound
|
||||
--------------------------
|
||||
|
||||
>>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, False) == \
|
||||
... dexpr('([z1],[give(z1,x,z)])')
|
||||
True
|
||||
>>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, True) == \
|
||||
... dexpr('([z1],[give(z1,x,z)])')
|
||||
True
|
||||
|
||||
replace unbound with unbound
|
||||
----------------------------
|
||||
|
||||
>>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, False))
|
||||
([x],[give(x,z,z)])
|
||||
>>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, True))
|
||||
([x],[give(x,z,z)])
|
||||
|
||||
|
||||
replace unbound
|
||||
---------------
|
||||
|
||||
>>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False))
|
||||
(([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
|
||||
>>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True))
|
||||
(([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
|
||||
|
||||
replace bound
|
||||
-------------
|
||||
|
||||
>>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, False))
|
||||
(([x],[P(x,y,z)]) + ([y],[Q(x,y,z)]))
|
||||
>>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, True))
|
||||
(([a],[P(a,y,z)]) + ([y],[Q(a,y,z)]))
|
||||
|
||||
replace unbound with unbound
|
||||
----------------------------
|
||||
|
||||
>>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False))
|
||||
(([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
|
||||
>>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True))
|
||||
(([x],[P(x,y,a)]) + ([y],[Q(x,y,a)]))
|
||||
|
||||
replace unbound with bound on same side
|
||||
---------------------------------------
|
||||
|
||||
>>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, False) == \
|
||||
... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))')
|
||||
True
|
||||
>>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, True) == \
|
||||
... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))')
|
||||
True
|
||||
|
||||
replace unbound with bound on other side
|
||||
----------------------------------------
|
||||
|
||||
>>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, False) == \
|
||||
... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))')
|
||||
True
|
||||
>>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, True) == \
|
||||
... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))')
|
||||
True
|
||||
|
||||
replace unbound with double bound
|
||||
---------------------------------
|
||||
|
||||
>>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, False) == \
|
||||
... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))')
|
||||
True
|
||||
>>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, True) == \
|
||||
... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))')
|
||||
True
|
||||
|
||||
|
||||
regression tests
|
||||
----------------
|
||||
|
||||
>>> d = dexpr('([x],[A(c), ([y],[B(x,y,z,a)])->([z],[C(x,y,z,a)])])')
|
||||
>>> print(d)
|
||||
([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
|
||||
>>> print(d.pretty_format())
|
||||
____________________________________
|
||||
| x |
|
||||
|------------------------------------|
|
||||
| A(c) |
|
||||
| ____________ ____________ |
|
||||
| | y | | z | |
|
||||
| (|------------| -> |------------|) |
|
||||
| | B(x,y,z,a) | | C(x,y,z,a) | |
|
||||
| |____________| |____________| |
|
||||
|____________________________________|
|
||||
>>> print(str(d))
|
||||
([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
|
||||
>>> print(d.fol())
|
||||
exists x.(A(c) & all y.(B(x,y,z,a) -> exists z.C(x,y,z,a)))
|
||||
>>> print(d.replace(Variable('a'), DrtVariableExpression(Variable('r'))))
|
||||
([x],[A(c), (([y],[B(x,y,z,r)]) -> ([z],[C(x,y,z,r)]))])
|
||||
>>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r'))))
|
||||
([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
|
||||
>>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r'))))
|
||||
([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))])
|
||||
>>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r'))))
|
||||
([x],[A(c), (([y],[B(x,y,r,a)]) -> ([z],[C(x,y,z,a)]))])
|
||||
>>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')), True))
|
||||
([r],[A(c), (([y],[B(r,y,z,a)]) -> ([z],[C(r,y,z,a)]))])
|
||||
>>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')), True))
|
||||
([x],[A(c), (([r],[B(x,r,z,a)]) -> ([z],[C(x,r,z,a)]))])
|
||||
>>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')), True))
|
||||
([x],[A(c), (([y],[B(x,y,r,a)]) -> ([r],[C(x,y,r,a)]))])
|
||||
>>> print(d == dexpr('([l],[A(c), ([m],[B(l,m,z,a)])->([n],[C(l,m,n,a)])])'))
|
||||
True
|
||||
>>> d = dexpr('([],[([x,y],[B(x,y,h), ([a,b],[dee(x,a,g)])])->([z,w],[cee(x,y,f), ([c,d],[E(x,c,d,e)])])])')
|
||||
>>> sorted(d.free())
|
||||
[Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')]
|
||||
>>> sorted(d.variables())
|
||||
[Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')]
|
||||
>>> sorted(d.get_refs(True))
|
||||
[Variable('a'), Variable('b'), Variable('c'), Variable('d'), Variable('w'), Variable('x'), Variable('y'), Variable('z')]
|
||||
>>> sorted(d.conds[0].get_refs(False))
|
||||
[Variable('x'), Variable('y')]
|
||||
>>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])->([],[C(x,y)]), ([x,y],[D(x,y)])->([],[E(x,y)]), ([],[F(x,y)])->([x,y],[G(x,y)])])').eliminate_equality())
|
||||
([x],[A(x,x), (([],[B(x,x)]) -> ([],[C(x,x)])), (([x,y],[D(x,y)]) -> ([],[E(x,y)])), (([],[F(x,x)]) -> ([x,y],[G(x,y)]))])
|
||||
>>> print(dexpr('([x,y],[A(x,y), (x=y)]) -> ([],[B(x,y)])').eliminate_equality())
|
||||
(([x],[A(x,x)]) -> ([],[B(x,x)]))
|
||||
>>> print(dexpr('([x,y],[A(x,y)]) -> ([],[B(x,y), (x=y)])').eliminate_equality())
|
||||
(([x,y],[A(x,y)]) -> ([],[B(x,x)]))
|
||||
>>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])])').eliminate_equality())
|
||||
([x],[A(x,x), ([],[B(x,x)])])
|
||||
>>> print(dexpr('([x,y],[A(x,y), ([],[B(x,y), (x=y)])])').eliminate_equality())
|
||||
([x,y],[A(x,y), ([],[B(x,x)])])
|
||||
>>> print(dexpr('([z8 z9 z10],[A(z8), z8=z10, z9=z10, B(z9), C(z10), D(z10)])').eliminate_equality())
|
||||
([z9],[A(z9), B(z9), C(z9), D(z9)])
|
||||
|
||||
>>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)]), ([x,y],[C(x,y)])])').eliminate_equality())
|
||||
([x],[A(x,x), ([],[B(x,x)]), ([x,y],[C(x,y)])])
|
||||
>>> print(dexpr('([x,y],[A(x,y)]) + ([],[B(x,y), (x=y)]) + ([],[C(x,y)])').eliminate_equality())
|
||||
([x],[A(x,x), B(x,x), C(x,x)])
|
||||
>>> print(dexpr('([x,y],[B(x,y)])+([x,y],[C(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))))
|
||||
(([x,y],[B(x,y)]) + ([x,y],[C(x,y)]))
|
||||
>>> print(dexpr('(([x,y],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))))
|
||||
(([x,y],[B(x,y)]) + ([],[C(x,y)]) + ([],[D(x,y)]))
|
||||
>>> print(dexpr('(([],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))))
|
||||
(([],[B(x,x)]) + ([],[C(x,x)]) + ([],[D(x,x)]))
|
||||
>>> print(dexpr('(([],[B(x,y), ([x,y],[A(x,y)])])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))).normalize())
|
||||
(([],[B(z3,z1), ([z2,z3],[A(z3,z2)])]) + ([],[C(z3,z1)]) + ([],[D(z3,z1)]))
|
||||
|
||||
|
||||
Parse errors
|
||||
============
|
||||
|
||||
>>> def parse_error(drtstring):
|
||||
... try: dexpr(drtstring)
|
||||
... except logic.LogicalExpressionException as e: print(e)
|
||||
|
||||
>>> parse_error(r'')
|
||||
End of input found. Expression expected.
|
||||
<BLANKLINE>
|
||||
^
|
||||
>>> parse_error(r'(')
|
||||
End of input found. Expression expected.
|
||||
(
|
||||
^
|
||||
>>> parse_error(r'()')
|
||||
Unexpected token: ')'. Expression expected.
|
||||
()
|
||||
^
|
||||
>>> parse_error(r'([')
|
||||
End of input found. Expected token ']'.
|
||||
([
|
||||
^
|
||||
>>> parse_error(r'([,')
|
||||
',' is an illegal variable name. Constants may not be quantified.
|
||||
([,
|
||||
^
|
||||
>>> parse_error(r'([x,')
|
||||
End of input found. Variable expected.
|
||||
([x,
|
||||
^
|
||||
>>> parse_error(r'([]')
|
||||
End of input found. Expected token '['.
|
||||
([]
|
||||
^
|
||||
>>> parse_error(r'([][')
|
||||
End of input found. Expected token ']'.
|
||||
([][
|
||||
^
|
||||
>>> parse_error(r'([][,')
|
||||
Unexpected token: ','. Expression expected.
|
||||
([][,
|
||||
^
|
||||
>>> parse_error(r'([][]')
|
||||
End of input found. Expected token ')'.
|
||||
([][]
|
||||
^
|
||||
>>> parse_error(r'([x][man(x)]) |')
|
||||
End of input found. Expression expected.
|
||||
([x][man(x)]) |
|
||||
^
|
||||
|
||||
Pretty Printing
|
||||
===============
|
||||
|
||||
>>> dexpr(r"([],[])").pretty_print()
|
||||
__
|
||||
| |
|
||||
|--|
|
||||
|__|
|
||||
|
||||
>>> dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print()
|
||||
_____________________________
|
||||
| |
|
||||
|-----------------------------|
|
||||
| ________ _________ |
|
||||
| | x | | | |
|
||||
| (|--------| -> |---------|) |
|
||||
| | big(x) | | bark(x) | |
|
||||
| | dog(x) | |_________| |
|
||||
| |________| |
|
||||
| _________ |
|
||||
| | x | |
|
||||
| __ |---------| |
|
||||
| | | walk(x) | |
|
||||
| |_________| |
|
||||
|_____________________________|
|
||||
|
||||
>>> dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print()
|
||||
_________ _________
|
||||
| x y | | z |
|
||||
(|---------| + |---------|)
|
||||
| (x = y) | | dog(z) |
|
||||
|_________| | walk(z) |
|
||||
|_________|
|
||||
|
||||
>>> dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print()
|
||||
_______________________________
|
||||
| |
|
||||
|-------------------------------|
|
||||
| ___ ___ _________ |
|
||||
| | x | | y | | z | |
|
||||
| (|---| | |---| | |---------|) |
|
||||
| |___| |___| | dog(z) | |
|
||||
| | walk(z) | |
|
||||
| |_________| |
|
||||
|_______________________________|
|
||||
|
||||
>>> dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print()
|
||||
___ ________
|
||||
\ | x | \ | |
|
||||
/\ P Q.(|---| + P(x) + Q(x))( /\ x.|--------|)
|
||||
|___| | dog(x) |
|
||||
|________|
|
||||
@@ -0,0 +1,610 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=========================
|
||||
Feature Grammar Parsing
|
||||
=========================
|
||||
|
||||
.. definitions from nltk_book/definitions.rst
|
||||
|
||||
.. role:: feat
|
||||
:class: feature
|
||||
.. role:: fval
|
||||
:class: fval
|
||||
.. |rarr| unicode:: U+2192 .. right arrow
|
||||
.. |dot| unicode:: U+2022 .. bullet
|
||||
.. |pi| unicode:: U+03C0
|
||||
|
||||
Grammars can be parsed from strings.
|
||||
|
||||
>>> import nltk
|
||||
>>> from nltk import grammar, parse
|
||||
>>> g = """
|
||||
... % start DP
|
||||
... DP[AGR=?a] -> D[AGR=?a] N[AGR=?a]
|
||||
... D[AGR=[NUM='sg', PERS=3]] -> 'this' | 'that'
|
||||
... D[AGR=[NUM='pl', PERS=3]] -> 'these' | 'those'
|
||||
... D[AGR=[NUM='pl', PERS=1]] -> 'we'
|
||||
... D[AGR=[PERS=2]] -> 'you'
|
||||
... N[AGR=[NUM='sg', GND='m']] -> 'boy'
|
||||
... N[AGR=[NUM='pl', GND='m']] -> 'boys'
|
||||
... N[AGR=[NUM='sg', GND='f']] -> 'girl'
|
||||
... N[AGR=[NUM='pl', GND='f']] -> 'girls'
|
||||
... N[AGR=[NUM='sg']] -> 'student'
|
||||
... N[AGR=[NUM='pl']] -> 'students'
|
||||
... """
|
||||
>>> grammar = grammar.FeatureGrammar.fromstring(g)
|
||||
>>> tokens = 'these girls'.split()
|
||||
>>> parser = parse.FeatureEarleyChartParser(grammar)
|
||||
>>> trees = parser.parse(tokens)
|
||||
>>> for tree in trees: print(tree)
|
||||
(DP[AGR=[GND='f', NUM='pl', PERS=3]]
|
||||
(D[AGR=[NUM='pl', PERS=3]] these)
|
||||
(N[AGR=[GND='f', NUM='pl']] girls))
|
||||
|
||||
In general, when we are trying to develop even a very small grammar,
|
||||
it is convenient to put the rules in a file where they can be edited,
|
||||
tested and revised. Let's assume that we have saved feat0cfg as a file named
|
||||
``'feat0.fcfg'`` and placed it in the NLTK ``data`` directory. We can
|
||||
inspect it as follows:
|
||||
|
||||
>>> nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
|
||||
% start S
|
||||
# ###################
|
||||
# Grammar Productions
|
||||
# ###################
|
||||
# S expansion productions
|
||||
S -> NP[NUM=?n] VP[NUM=?n]
|
||||
# NP expansion productions
|
||||
NP[NUM=?n] -> N[NUM=?n]
|
||||
NP[NUM=?n] -> PropN[NUM=?n]
|
||||
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
|
||||
NP[NUM=pl] -> N[NUM=pl]
|
||||
# VP expansion productions
|
||||
VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
|
||||
VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
|
||||
# ###################
|
||||
# Lexical Productions
|
||||
# ###################
|
||||
Det[NUM=sg] -> 'this' | 'every'
|
||||
Det[NUM=pl] -> 'these' | 'all'
|
||||
Det -> 'the' | 'some' | 'several'
|
||||
PropN[NUM=sg]-> 'Kim' | 'Jody'
|
||||
N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
|
||||
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children'
|
||||
IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks'
|
||||
TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
|
||||
IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk'
|
||||
TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
|
||||
IV[TENSE=past] -> 'disappeared' | 'walked'
|
||||
TV[TENSE=past] -> 'saw' | 'liked'
|
||||
|
||||
Assuming we have saved feat0cfg as a file named
|
||||
``'feat0.fcfg'``, the function ``parse.load_parser`` allows us to
|
||||
read the grammar into NLTK, ready for use in parsing.
|
||||
|
||||
|
||||
>>> cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1)
|
||||
>>> sent = 'Kim likes children'
|
||||
>>> tokens = sent.split()
|
||||
>>> tokens
|
||||
['Kim', 'likes', 'children']
|
||||
>>> trees = cp.parse(tokens)
|
||||
|.Kim .like.chil.|
|
||||
|[----] . .| [0:1] 'Kim'
|
||||
|. [----] .| [1:2] 'likes'
|
||||
|. . [----]| [2:3] 'children'
|
||||
|[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' *
|
||||
|[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *
|
||||
|[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
|
||||
|. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' *
|
||||
|. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}
|
||||
|. . [----]| [2:3] N[NUM='pl'] -> 'children' *
|
||||
|. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] *
|
||||
|. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'}
|
||||
|. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] *
|
||||
|[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] *
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[]
|
||||
(NP[NUM='sg'] (PropN[NUM='sg'] Kim))
|
||||
(VP[NUM='sg', TENSE='pres']
|
||||
(TV[NUM='sg', TENSE='pres'] likes)
|
||||
(NP[NUM='pl'] (N[NUM='pl'] children))))
|
||||
|
||||
The parser works directly with
|
||||
the underspecified productions given by the grammar. That is, the
|
||||
Predictor rule does not attempt to compile out all admissible feature
|
||||
combinations before trying to expand the non-terminals on the left hand
|
||||
side of a production. However, when the Scanner matches an input word
|
||||
against a lexical production that has been predicted, the new edge will
|
||||
typically contain fully specified features; e.g., the edge
|
||||
[PropN[`num`:feat: = `sg`:fval:] |rarr| 'Kim', (0, 1)]. Recall from
|
||||
Chapter 8 that the Fundamental (or Completer) Rule in
|
||||
standard CFGs is used to combine an incomplete edge that's expecting a
|
||||
nonterminal *B* with a following, complete edge whose left hand side
|
||||
matches *B*. In our current setting, rather than checking for a
|
||||
complete match, we test whether the expected category *B* will
|
||||
unify with the left hand side *B'* of a following complete
|
||||
edge. We will explain in more detail in Section 9.2 how
|
||||
unification works; for the moment, it is enough to know that as a
|
||||
result of unification, any variable values of features in *B* will be
|
||||
instantiated by constant values in the corresponding feature structure
|
||||
in *B'*, and these instantiated values will be used in the new edge
|
||||
added by the Completer. This instantiation can be seen, for example,
|
||||
in the edge
|
||||
[NP [`num`:feat:\ =\ `sg`:fval:] |rarr| PropN[`num`:feat:\ =\ `sg`:fval:] |dot|, (0, 1)]
|
||||
in Example 9.2, where the feature `num`:feat: has been assigned the value `sg`:fval:.
|
||||
|
||||
Feature structures in NLTK are declared with the ``FeatStruct()`` constructor. Atomic feature values can be strings or
|
||||
integers.
|
||||
|
||||
>>> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
|
||||
>>> print(fs1)
|
||||
[ NUM = 'sg' ]
|
||||
[ TENSE = 'past' ]
|
||||
|
||||
We can think of a feature structure as being like a Python dictionary,
|
||||
and access its values by indexing in the usual way.
|
||||
|
||||
>>> fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
|
||||
>>> print(fs1['GND'])
|
||||
fem
|
||||
|
||||
We can also define feature structures which have complex values, as
|
||||
discussed earlier.
|
||||
|
||||
>>> fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
|
||||
>>> print(fs2)
|
||||
[ [ GND = 'fem' ] ]
|
||||
[ AGR = [ NUM = 'pl' ] ]
|
||||
[ [ PER = 3 ] ]
|
||||
[ ]
|
||||
[ POS = 'N' ]
|
||||
>>> print(fs2['AGR'])
|
||||
[ GND = 'fem' ]
|
||||
[ NUM = 'pl' ]
|
||||
[ PER = 3 ]
|
||||
>>> print(fs2['AGR']['PER'])
|
||||
3
|
||||
|
||||
Feature structures can also be constructed from a string by the constructor
|
||||
method of the ``nltk.FeatStruct`` class. Note that in this case, atomic
|
||||
feature values do not need to be enclosed in quotes.
|
||||
|
||||
>>> f1 = nltk.FeatStruct("[NUMBER = sg]")
|
||||
>>> f2 = nltk.FeatStruct("[PERSON = 3]")
|
||||
>>> print(nltk.unify(f1, f2))
|
||||
[ NUMBER = 'sg' ]
|
||||
[ PERSON = 3 ]
|
||||
|
||||
>>> f1 = nltk.FeatStruct("[A = [B = b, D = d]]")
|
||||
>>> f2 = nltk.FeatStruct("[A = [C = c, D = d]]")
|
||||
>>> print(nltk.unify(f1, f2))
|
||||
[ [ B = 'b' ] ]
|
||||
[ A = [ C = 'c' ] ]
|
||||
[ [ D = 'd' ] ]
|
||||
|
||||
|
||||
Feature Structures as Graphs
|
||||
----------------------------
|
||||
|
||||
Feature structures are not inherently tied to linguistic objects; they are
|
||||
general purpose structures for representing knowledge. For example, we
|
||||
could encode information about a person in a feature structure:
|
||||
|
||||
>>> person01 = nltk.FeatStruct("[NAME=Lee, TELNO='01 27 86 42 96',AGE=33]")
|
||||
>>> print(person01)
|
||||
[ AGE = 33 ]
|
||||
[ NAME = 'Lee' ]
|
||||
[ TELNO = '01 27 86 42 96' ]
|
||||
|
||||
There are a number of notations for representing reentrancy in
|
||||
matrix-style representations of feature structures. In NLTK, we adopt
|
||||
the following convention: the first occurrence of a shared feature structure
|
||||
is prefixed with an integer in parentheses, such as ``(1)``, and any
|
||||
subsequent reference to that structure uses the notation
|
||||
``->(1)``, as shown below.
|
||||
|
||||
|
||||
>>> fs = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
|
||||
... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""")
|
||||
>>> print(fs)
|
||||
[ ADDRESS = (1) [ NUMBER = 74 ] ]
|
||||
[ [ STREET = 'rue Pascal' ] ]
|
||||
[ ]
|
||||
[ NAME = 'Lee' ]
|
||||
[ ]
|
||||
[ SPOUSE = [ ADDRESS -> (1) ] ]
|
||||
[ [ NAME = 'Kim' ] ]
|
||||
|
||||
There can be any number of tags within a single feature structure.
|
||||
|
||||
>>> fs3 = nltk.FeatStruct("[A=(1)[B=b], C=(2)[], D->(1), E->(2)]")
|
||||
>>> print(fs3)
|
||||
[ A = (1) [ B = 'b' ] ]
|
||||
[ ]
|
||||
[ C = (2) [] ]
|
||||
[ ]
|
||||
[ D -> (1) ]
|
||||
[ E -> (2) ]
|
||||
>>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
|
||||
>>> fs2 = nltk.FeatStruct(CITY='Paris')
|
||||
>>> print(nltk.unify(fs1, fs2))
|
||||
[ CITY = 'Paris' ]
|
||||
[ NUMBER = 74 ]
|
||||
[ STREET = 'rue Pascal' ]
|
||||
|
||||
Unification is symmetric:
|
||||
|
||||
>>> nltk.unify(fs1, fs2) == nltk.unify(fs2, fs1)
|
||||
True
|
||||
|
||||
Unification is associative:
|
||||
|
||||
>>> fs3 = nltk.FeatStruct(TELNO='01 27 86 42 96')
|
||||
>>> nltk.unify(nltk.unify(fs1, fs2), fs3) == nltk.unify(fs1, nltk.unify(fs2, fs3))
|
||||
True
|
||||
|
||||
Unification between *FS*:math:`_0` and *FS*:math:`_1` will fail if the
|
||||
two feature structures share a path |pi|,
|
||||
but the value of |pi| in *FS*:math:`_0` is a distinct
|
||||
atom from the value of |pi| in *FS*:math:`_1`. In NLTK,
|
||||
this is implemented by setting the result of unification to be
|
||||
``None``.
|
||||
|
||||
>>> fs0 = nltk.FeatStruct(A='a')
|
||||
>>> fs1 = nltk.FeatStruct(A='b')
|
||||
>>> print(nltk.unify(fs0, fs1))
|
||||
None
|
||||
|
||||
Now, if we look at how unification interacts with structure-sharing,
|
||||
things become really interesting.
|
||||
|
||||
|
||||
|
||||
>>> fs0 = nltk.FeatStruct("""[NAME=Lee,
|
||||
... ADDRESS=[NUMBER=74,
|
||||
... STREET='rue Pascal'],
|
||||
... SPOUSE= [NAME=Kim,
|
||||
... ADDRESS=[NUMBER=74,
|
||||
... STREET='rue Pascal']]]""")
|
||||
>>> print(fs0)
|
||||
[ ADDRESS = [ NUMBER = 74 ] ]
|
||||
[ [ STREET = 'rue Pascal' ] ]
|
||||
[ ]
|
||||
[ NAME = 'Lee' ]
|
||||
[ ]
|
||||
[ [ ADDRESS = [ NUMBER = 74 ] ] ]
|
||||
[ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
|
||||
[ [ ] ]
|
||||
[ [ NAME = 'Kim' ] ]
|
||||
|
||||
|
||||
>>> fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]")
|
||||
>>> print(nltk.unify(fs0, fs1))
|
||||
[ ADDRESS = [ NUMBER = 74 ] ]
|
||||
[ [ STREET = 'rue Pascal' ] ]
|
||||
[ ]
|
||||
[ NAME = 'Lee' ]
|
||||
[ ]
|
||||
[ [ [ CITY = 'Paris' ] ] ]
|
||||
[ [ ADDRESS = [ NUMBER = 74 ] ] ]
|
||||
[ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
|
||||
[ [ ] ]
|
||||
[ [ NAME = 'Kim' ] ]
|
||||
|
||||
>>> fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
|
||||
... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""")
|
||||
|
||||
|
||||
>>> print(fs2)
|
||||
[ ADDRESS = (1) [ NUMBER = 74 ] ]
|
||||
[ [ STREET = 'rue Pascal' ] ]
|
||||
[ ]
|
||||
[ NAME = 'Lee' ]
|
||||
[ ]
|
||||
[ SPOUSE = [ ADDRESS -> (1) ] ]
|
||||
[ [ NAME = 'Kim' ] ]
|
||||
|
||||
|
||||
>>> print(nltk.unify(fs2, fs1))
|
||||
[ [ CITY = 'Paris' ] ]
|
||||
[ ADDRESS = (1) [ NUMBER = 74 ] ]
|
||||
[ [ STREET = 'rue Pascal' ] ]
|
||||
[ ]
|
||||
[ NAME = 'Lee' ]
|
||||
[ ]
|
||||
[ SPOUSE = [ ADDRESS -> (1) ] ]
|
||||
[ [ NAME = 'Kim' ] ]
|
||||
|
||||
|
||||
>>> fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]")
|
||||
>>> fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]")
|
||||
>>> print(fs2)
|
||||
[ ADDRESS1 = ?x ]
|
||||
[ ADDRESS2 = ?x ]
|
||||
>>> print(nltk.unify(fs1, fs2))
|
||||
[ ADDRESS1 = (1) [ NUMBER = 74 ] ]
|
||||
[ [ STREET = 'rue Pascal' ] ]
|
||||
[ ]
|
||||
[ ADDRESS2 -> (1) ]
|
||||
|
||||
|
||||
|
||||
|
||||
>>> sent = 'who do you claim that you like'
|
||||
>>> tokens = sent.split()
|
||||
>>> cp = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1)
|
||||
>>> trees = cp.parse(tokens)
|
||||
|.w.d.y.c.t.y.l.|
|
||||
|[-] . . . . . .| [0:1] 'who'
|
||||
|. [-] . . . . .| [1:2] 'do'
|
||||
|. . [-] . . . .| [2:3] 'you'
|
||||
|. . . [-] . . .| [3:4] 'claim'
|
||||
|. . . . [-] . .| [4:5] 'that'
|
||||
|. . . . . [-] .| [5:6] 'you'
|
||||
|. . . . . . [-]| [6:7] 'like'
|
||||
|# . . . . . . .| [0:0] NP[]/NP[] -> *
|
||||
|. # . . . . . .| [1:1] NP[]/NP[] -> *
|
||||
|. . # . . . . .| [2:2] NP[]/NP[] -> *
|
||||
|. . . # . . . .| [3:3] NP[]/NP[] -> *
|
||||
|. . . . # . . .| [4:4] NP[]/NP[] -> *
|
||||
|. . . . . # . .| [5:5] NP[]/NP[] -> *
|
||||
|. . . . . . # .| [6:6] NP[]/NP[] -> *
|
||||
|. . . . . . . #| [7:7] NP[]/NP[] -> *
|
||||
|[-] . . . . . .| [0:1] NP[+WH] -> 'who' *
|
||||
|[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {}
|
||||
|[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
|
||||
|[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {}
|
||||
|. [-] . . . . .| [1:2] V[+AUX] -> 'do' *
|
||||
|. [-> . . . . .| [1:2] S[+INV] -> V[+AUX] * NP[] VP[] {}
|
||||
|. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {}
|
||||
|. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {}
|
||||
|. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {}
|
||||
|. . [-] . . . .| [2:3] NP[-WH] -> 'you' *
|
||||
|. . [-> . . . .| [2:3] S[-INV] -> NP[] * VP[] {}
|
||||
|. . [-> . . . .| [2:3] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
|
||||
|. . [-> . . . .| [2:3] S[-INV] -> NP[] * S[]/NP[] {}
|
||||
|. [---> . . . .| [1:3] S[+INV] -> V[+AUX] NP[] * VP[] {}
|
||||
|. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {}
|
||||
|. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' *
|
||||
|. . . [-> . . .| [3:4] VP[] -> V[-AUX, SUBCAT='clause'] * SBar[] {}
|
||||
|. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {}
|
||||
|. . . . [-] . .| [4:5] Comp[] -> 'that' *
|
||||
|. . . . [-> . .| [4:5] SBar[] -> Comp[] * S[-INV] {}
|
||||
|. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {}
|
||||
|. . . . . [-] .| [5:6] NP[-WH] -> 'you' *
|
||||
|. . . . . [-> .| [5:6] S[-INV] -> NP[] * VP[] {}
|
||||
|. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
|
||||
|. . . . . [-> .| [5:6] S[-INV] -> NP[] * S[]/NP[] {}
|
||||
|. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' *
|
||||
|. . . . . . [->| [6:7] VP[] -> V[-AUX, SUBCAT='trans'] * NP[] {}
|
||||
|. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {}
|
||||
|. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] *
|
||||
|. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] *
|
||||
|. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] *
|
||||
|. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] *
|
||||
|. . [---------]| [2:7] S[-INV]/NP[] -> NP[] VP[]/NP[] *
|
||||
|. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] *
|
||||
|[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] *
|
||||
|
||||
>>> trees = list(trees)
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[-INV]
|
||||
(NP[+WH] who)
|
||||
(S[+INV]/NP[]
|
||||
(V[+AUX] do)
|
||||
(NP[-WH] you)
|
||||
(VP[]/NP[]
|
||||
(V[-AUX, SUBCAT='clause'] claim)
|
||||
(SBar[]/NP[]
|
||||
(Comp[] that)
|
||||
(S[-INV]/NP[]
|
||||
(NP[-WH] you)
|
||||
(VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] )))))))
|
||||
|
||||
A different parser should give the same parse trees, but perhaps in a different order:
|
||||
|
||||
>>> cp2 = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1,
|
||||
... parser=parse.FeatureEarleyChartParser)
|
||||
>>> trees2 = cp2.parse(tokens)
|
||||
|.w.d.y.c.t.y.l.|
|
||||
|[-] . . . . . .| [0:1] 'who'
|
||||
|. [-] . . . . .| [1:2] 'do'
|
||||
|. . [-] . . . .| [2:3] 'you'
|
||||
|. . . [-] . . .| [3:4] 'claim'
|
||||
|. . . . [-] . .| [4:5] 'that'
|
||||
|. . . . . [-] .| [5:6] 'you'
|
||||
|. . . . . . [-]| [6:7] 'like'
|
||||
|> . . . . . . .| [0:0] S[-INV] -> * NP[] VP[] {}
|
||||
|> . . . . . . .| [0:0] S[-INV]/?x[] -> * NP[] VP[]/?x[] {}
|
||||
|> . . . . . . .| [0:0] S[-INV] -> * NP[] S[]/NP[] {}
|
||||
|> . . . . . . .| [0:0] S[-INV] -> * Adv[+NEG] S[+INV] {}
|
||||
|> . . . . . . .| [0:0] S[+INV] -> * V[+AUX] NP[] VP[] {}
|
||||
|> . . . . . . .| [0:0] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {}
|
||||
|> . . . . . . .| [0:0] NP[+WH] -> * 'who' {}
|
||||
|[-] . . . . . .| [0:1] NP[+WH] -> 'who' *
|
||||
|[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {}
|
||||
|[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
|
||||
|[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {}
|
||||
|. > . . . . . .| [1:1] S[-INV]/?x[] -> * NP[] VP[]/?x[] {}
|
||||
|. > . . . . . .| [1:1] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {}
|
||||
|. > . . . . . .| [1:1] V[+AUX] -> * 'do' {}
|
||||
|. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
|
||||
|. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
|
||||
|. > . . . . . .| [1:1] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
|
||||
|. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='intrans'] {}
|
||||
|. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {}
|
||||
|. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {}
|
||||
|. > . . . . . .| [1:1] VP[] -> * V[+AUX] VP[] {}
|
||||
|. [-] . . . . .| [1:2] V[+AUX] -> 'do' *
|
||||
|. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {}
|
||||
|. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {}
|
||||
|. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {}
|
||||
|. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='intrans'] {}
|
||||
|. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {}
|
||||
|. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {}
|
||||
|. . > . . . . .| [2:2] VP[] -> * V[+AUX] VP[] {}
|
||||
|. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
|
||||
|. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
|
||||
|. . > . . . . .| [2:2] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
|
||||
|. . > . . . . .| [2:2] NP[-WH] -> * 'you' {}
|
||||
|. . [-] . . . .| [2:3] NP[-WH] -> 'you' *
|
||||
|. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {}
|
||||
|. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
|
||||
|. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
|
||||
|. . . > . . . .| [3:3] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
|
||||
|. . . > . . . .| [3:3] V[-AUX, SUBCAT='clause'] -> * 'claim' {}
|
||||
|. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' *
|
||||
|. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {}
|
||||
|. . . . > . . .| [4:4] SBar[]/?x[] -> * Comp[] S[-INV]/?x[] {}
|
||||
|. . . . > . . .| [4:4] Comp[] -> * 'that' {}
|
||||
|. . . . [-] . .| [4:5] Comp[] -> 'that' *
|
||||
|. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {}
|
||||
|. . . . . > . .| [5:5] S[-INV]/?x[] -> * NP[] VP[]/?x[] {}
|
||||
|. . . . . > . .| [5:5] NP[-WH] -> * 'you' {}
|
||||
|. . . . . [-] .| [5:6] NP[-WH] -> 'you' *
|
||||
|. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {}
|
||||
|. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {}
|
||||
|. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {}
|
||||
|. . . . . . > .| [6:6] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {}
|
||||
|. . . . . . > .| [6:6] V[-AUX, SUBCAT='trans'] -> * 'like' {}
|
||||
|. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' *
|
||||
|. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {}
|
||||
|. . . . . . . #| [7:7] NP[]/NP[] -> *
|
||||
|. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] *
|
||||
|. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] *
|
||||
|. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] *
|
||||
|. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] *
|
||||
|. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] *
|
||||
|[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] *
|
||||
|
||||
>>> sorted(trees) == sorted(trees2)
|
||||
True
|
||||
|
||||
|
||||
Let's load a German grammar:
|
||||
|
||||
>>> cp = parse.load_parser('grammars/book_grammars/german.fcfg', trace=0)
|
||||
>>> sent = 'die Katze sieht den Hund'
|
||||
>>> tokens = sent.split()
|
||||
>>> trees = cp.parse(tokens)
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[]
|
||||
(NP[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom']
|
||||
(Det[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] die)
|
||||
(N[AGR=[GND='fem', NUM='sg', PER=3]] Katze))
|
||||
(VP[AGR=[NUM='sg', PER=3]]
|
||||
(TV[AGR=[NUM='sg', PER=3], OBJCASE='acc'] sieht)
|
||||
(NP[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc']
|
||||
(Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] den)
|
||||
(N[AGR=[GND='masc', NUM='sg', PER=3]] Hund))))
|
||||
|
||||
Grammar with Binding Operators
|
||||
------------------------------
|
||||
The bindop.fcfg grammar is a semantic grammar that uses lambda
|
||||
calculus. Each element has a core semantics, which is a single lambda
|
||||
calculus expression; and a set of binding operators, which bind
|
||||
variables.
|
||||
|
||||
In order to make the binding operators work right, they need to
|
||||
instantiate their bound variable every time they are added to the
|
||||
chart. To do this, we use a special subclass of `Chart`, called
|
||||
`InstantiateVarsChart`.
|
||||
|
||||
>>> from nltk.parse.featurechart import InstantiateVarsChart
|
||||
>>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=1,
|
||||
... chart_class=InstantiateVarsChart)
|
||||
>>> print(cp.grammar())
|
||||
Grammar with 15 productions (start state = S[])
|
||||
S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] VP[SEM=[BO=?b2, CORE=?vp]]
|
||||
VP[SEM=[BO={?b1+?b2}, CORE=<?v(?obj)>]] -> TV[SEM=[BO=?b1, CORE=?v]] NP[SEM=[BO=?b2, CORE=?obj]]
|
||||
VP[SEM=?s] -> IV[SEM=?s]
|
||||
NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] N[SEM=[BO=?b2, CORE=?n]]
|
||||
Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a'
|
||||
N[SEM=[BO={/}, CORE=<dog>]] -> 'dog'
|
||||
N[SEM=[BO={/}, CORE=<dog>]] -> 'cat'
|
||||
N[SEM=[BO={/}, CORE=<dog>]] -> 'mouse'
|
||||
IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks'
|
||||
IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'eats'
|
||||
IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'walks'
|
||||
TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds'
|
||||
TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'walks'
|
||||
NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'john'
|
||||
NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'alex'
|
||||
|
||||
A simple intransitive sentence:
|
||||
|
||||
>>> from nltk.sem import logic
|
||||
>>> logic._counter._value = 100
|
||||
|
||||
>>> trees = cp.parse('john barks'.split())
|
||||
|. john.barks.|
|
||||
|[-----] .| [0:1] 'john'
|
||||
|. [-----]| [1:2] 'barks'
|
||||
|[-----] .| [0:1] NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=<z101>]] -> 'john' *
|
||||
|[-----> .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: <IndividualVariableExpression z2>}
|
||||
|. [-----]| [1:2] IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' *
|
||||
|. [-----]| [1:2] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] *
|
||||
|[===========]| [0:2] S[SEM=[BO={bo(\P.P(John),z2)}, CORE=<bark(z2)>]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<z2>]] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] *
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[SEM=[BO={bo(\P.P(John),z2)}, CORE=<bark(z2)>]]
|
||||
(NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=<z101>]] john)
|
||||
(VP[SEM=[BO={/}, CORE=<\x.bark(x)>]]
|
||||
(IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] barks)))
|
||||
|
||||
A transitive sentence:
|
||||
|
||||
>>> trees = cp.parse('john feeds a dog'.split())
|
||||
|.joh.fee. a .dog.|
|
||||
|[---] . . .| [0:1] 'john'
|
||||
|. [---] . .| [1:2] 'feeds'
|
||||
|. . [---] .| [2:3] 'a'
|
||||
|. . . [---]| [3:4] 'dog'
|
||||
|[---] . . .| [0:1] NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=<z102>]] -> 'john' *
|
||||
|[---> . . .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: <IndividualVariableExpression z2>}
|
||||
|. [---] . .| [1:2] TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' *
|
||||
|. [---> . .| [1:2] VP[SEM=[BO={?b1+?b2}, CORE=<?v(?obj)>]] -> TV[SEM=[BO=?b1, CORE=?v]] * NP[SEM=[BO=?b2, CORE=?obj]] {?b1: {/}, ?v: <LambdaExpression \x y.feed(y,x)>}
|
||||
|. . [---] .| [2:3] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' *
|
||||
|. . [---> .| [2:3] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] * N[SEM=[BO=?b2, CORE=?n]] {?b1: {/}, ?det: <LambdaExpression \Q P.exists x.(Q(x) & P(x))>}
|
||||
|. . . [---]| [3:4] N[SEM=[BO={/}, CORE=<dog>]] -> 'dog' *
|
||||
|. . [-------]| [2:4] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=<z103>]] -> Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] N[SEM=[BO={/}, CORE=<dog>]] *
|
||||
|. . [------->| [2:4] S[SEM=[BO={?b1+?b2}, CORE=<?vp(?subj)>]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.exists x.(dog(x) & P(x)),z2)}, ?subj: <IndividualVariableExpression z2>}
|
||||
|. [-----------]| [1:4] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] -> TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<z2>]] *
|
||||
|[===============]| [0:4] S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<feed(z2,z3)>]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<z2>]] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<\y.feed(y,z3)>]] *
|
||||
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<feed(z2,z3)>]]
|
||||
(NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=<z102>]] john)
|
||||
(VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]]
|
||||
(TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds)
|
||||
(NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=<z103>]]
|
||||
(Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a)
|
||||
(N[SEM=[BO={/}, CORE=<dog>]] dog))))
|
||||
|
||||
Turn down the verbosity:
|
||||
|
||||
>>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=0,
|
||||
... chart_class=InstantiateVarsChart)
|
||||
|
||||
Reuse the same lexical item twice:
|
||||
|
||||
>>> trees = cp.parse('john feeds john'.split())
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[SEM=[BO={bo(\P.P(John),z2), bo(\P.P(John),z3)}, CORE=<feed(z2,z3)>]]
|
||||
(NP[SEM=[BO={bo(\P.P(John),z104)}, CORE=<z104>]] john)
|
||||
(VP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<\y.feed(y,z2)>]]
|
||||
(TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds)
|
||||
(NP[SEM=[BO={bo(\P.P(John),z105)}, CORE=<z105>]] john)))
|
||||
|
||||
>>> trees = cp.parse('a dog feeds a dog'.split())
|
||||
>>> for tree in trees: print(tree)
|
||||
(S[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<feed(z2,z3)>]]
|
||||
(NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z106)}, CORE=<z106>]]
|
||||
(Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a)
|
||||
(N[SEM=[BO={/}, CORE=<dog>]] dog))
|
||||
(VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]]
|
||||
(TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds)
|
||||
(NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z107)}, CORE=<z107>]]
|
||||
(Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a)
|
||||
(N[SEM=[BO={/}, CORE=<dog>]] dog))))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,288 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
========
|
||||
FrameNet
|
||||
========
|
||||
|
||||
The FrameNet corpus is a lexical database of English that is both human-
|
||||
and machine-readable, based on annotating examples of how words are used
|
||||
in actual texts. FrameNet is based on a theory of meaning called Frame
|
||||
Semantics, deriving from the work of Charles J. Fillmore and colleagues.
|
||||
The basic idea is straightforward: that the meanings of most words can
|
||||
best be understood on the basis of a semantic frame: a description of a
|
||||
type of event, relation, or entity and the participants in it. For
|
||||
example, the concept of cooking typically involves a person doing the
|
||||
cooking (Cook), the food that is to be cooked (Food), something to hold
|
||||
the food while cooking (Container) and a source of heat
|
||||
(Heating_instrument). In the FrameNet project, this is represented as a
|
||||
frame called Apply_heat, and the Cook, Food, Heating_instrument and
|
||||
Container are called frame elements (FEs). Words that evoke this frame,
|
||||
such as fry, bake, boil, and broil, are called lexical units (LUs) of
|
||||
the Apply_heat frame. The job of FrameNet is to define the frames
|
||||
and to annotate sentences to show how the FEs fit syntactically around
|
||||
the word that evokes the frame.
|
||||
|
||||
------
|
||||
Frames
|
||||
------
|
||||
|
||||
A Frame is a script-like conceptual structure that describes a
|
||||
particular type of situation, object, or event along with the
|
||||
participants and props that are needed for that Frame. For
|
||||
example, the "Apply_heat" frame describes a common situation
|
||||
involving a Cook, some Food, and a Heating_instrument, and is
|
||||
evoked by words such as bake, blanch, boil, broil, brown,
|
||||
simmer, steam, etc.
|
||||
|
||||
We call the roles of a Frame "frame elements" (FEs) and the
|
||||
frame-evoking words are called "lexical units" (LUs).
|
||||
|
||||
FrameNet includes relations between Frames. Several types of
|
||||
relations are defined, of which the most important are:
|
||||
|
||||
- Inheritance: An IS-A relation. The child frame is a subtype
|
||||
of the parent frame, and each FE in the parent is bound to
|
||||
a corresponding FE in the child. An example is the
|
||||
"Revenge" frame which inherits from the
|
||||
"Rewards_and_punishments" frame.
|
||||
|
||||
- Using: The child frame presupposes the parent frame as
|
||||
background, e.g. the "Speed" frame "uses" (or presupposes)
|
||||
the "Motion" frame; however, not all parent FEs need to be
|
||||
bound to child FEs.
|
||||
|
||||
- Subframe: The child frame is a subevent of a complex event
|
||||
represented by the parent, e.g. the "Criminal_process" frame
|
||||
has subframes of "Arrest", "Arraignment", "Trial", and
|
||||
"Sentencing".
|
||||
|
||||
- Perspective_on: The child frame provides a particular
|
||||
perspective on an un-perspectivized parent frame. A pair of
|
||||
examples consists of the "Hiring" and "Get_a_job" frames,
|
||||
which perspectivize the "Employment_start" frame from the
|
||||
Employer's and the Employee's point of view, respectively.
|
||||
|
||||
To get a list of all of the Frames in FrameNet, you can use the
|
||||
`frames()` function. If you supply a regular expression pattern to the
|
||||
`frames()` function, you will get a list of all Frames whose names match
|
||||
that pattern:
|
||||
|
||||
>>> from pprint import pprint
|
||||
>>> from operator import itemgetter
|
||||
>>> from nltk.corpus import framenet as fn
|
||||
>>> from nltk.corpus.reader.framenet import PrettyList
|
||||
>>> x = fn.frames(r'(?i)crim')
|
||||
>>> x.sort(key=itemgetter('ID'))
|
||||
>>> x
|
||||
[<frame ID=200 name=Criminal_process>, <frame ID=500 name=Criminal_investigation>, ...]
|
||||
>>> PrettyList(sorted(x, key=itemgetter('ID')))
|
||||
[<frame ID=200 name=Criminal_process>, <frame ID=500 name=Criminal_investigation>, ...]
|
||||
|
||||
To get the details of a particular Frame, you can use the `frame()`
|
||||
function passing in the frame number:
|
||||
|
||||
>>> from pprint import pprint
|
||||
>>> from nltk.corpus import framenet as fn
|
||||
>>> f = fn.frame(202)
|
||||
>>> f.ID
|
||||
202
|
||||
>>> f.name
|
||||
'Arrest'
|
||||
>>> f.definition
|
||||
"Authorities charge a Suspect, who is under suspicion of having committed a crime..."
|
||||
>>> len(f.lexUnit)
|
||||
11
|
||||
>>> pprint(sorted([x for x in f.FE]))
|
||||
['Authorities',
|
||||
'Charges',
|
||||
'Co-participant',
|
||||
'Manner',
|
||||
'Means',
|
||||
'Offense',
|
||||
'Place',
|
||||
'Purpose',
|
||||
'Source_of_legal_authority',
|
||||
'Suspect',
|
||||
'Time',
|
||||
'Type']
|
||||
>>> pprint(f.frameRelations)
|
||||
[<Parent=Intentionally_affect -- Inheritance -> Child=Arrest>, <Complex=Criminal_process -- Subframe -> Component=Arrest>, ...]
|
||||
|
||||
The `frame()` function shown above returns a dict object containing
|
||||
detailed information about the Frame. See the documentation on the
|
||||
`frame()` function for the specifics.
|
||||
|
||||
You can also search for Frames by their Lexical Units (LUs). The
|
||||
`frames_by_lemma()` function returns a list of all frames that contain
|
||||
LUs in which the 'name' attribute of the LU matches the given regular
|
||||
expression. Note that LU names are composed of "lemma.POS", where the
|
||||
"lemma" part can be made up of either a single lexeme (e.g. 'run') or
|
||||
multiple lexemes (e.g. 'a little') (see below).
|
||||
|
||||
>>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID')))
|
||||
[<frame ID=189 name=Quanti...>, <frame ID=2001 name=Degree>]
|
||||
|
||||
-------------
|
||||
Lexical Units
|
||||
-------------
|
||||
|
||||
A lexical unit (LU) is a pairing of a word with a meaning. For
|
||||
example, the "Apply_heat" Frame describes a common situation
|
||||
involving a Cook, some Food, and a Heating Instrument, and is
|
||||
_evoked_ by words such as bake, blanch, boil, broil, brown,
|
||||
simmer, steam, etc. These frame-evoking words are the LUs in the
|
||||
Apply_heat frame. Each sense of a polysemous word is a different
|
||||
LU.
|
||||
|
||||
We have used the word "word" in talking about LUs. The reality
|
||||
is actually rather complex. When we say that the word "bake" is
|
||||
polysemous, we mean that the lemma "bake.v" (which has the
|
||||
word-forms "bake", "bakes", "baked", and "baking") is linked to
|
||||
three different frames:
|
||||
|
||||
- Apply_heat: "Michelle baked the potatoes for 45 minutes."
|
||||
|
||||
- Cooking_creation: "Michelle baked her mother a cake for her birthday."
|
||||
|
||||
- Absorb_heat: "The potatoes have to bake for more than 30 minutes."
|
||||
|
||||
These constitute three different LUs, with different
|
||||
definitions.
|
||||
|
||||
Multiword expressions such as "given name" and hyphenated words
|
||||
like "shut-eye" can also be LUs. Idiomatic phrases such as
|
||||
"middle of nowhere" and "give the slip (to)" are also defined as
|
||||
LUs in the appropriate frames ("Isolated_places" and "Evading",
|
||||
respectively), and their internal structure is not analyzed.
|
||||
|
||||
FrameNet provides multiple annotated examples of each sense of a
|
||||
word (i.e. each LU). Moreover, the set of examples
|
||||
(approximately 20 per LU) illustrates all of the combinatorial
|
||||
possibilities of the lexical unit.
|
||||
|
||||
Each LU is linked to a Frame, and hence to the other words which
|
||||
evoke that Frame. This makes the FrameNet database similar to a
|
||||
thesaurus, grouping together semantically similar words.
|
||||
|
||||
In the simplest case, frame-evoking words are verbs such as
|
||||
"fried" in:
|
||||
|
||||
"Matilde fried the catfish in a heavy iron skillet."
|
||||
|
||||
Sometimes event nouns may evoke a Frame. For example,
|
||||
"reduction" evokes "Cause_change_of_scalar_position" in:
|
||||
|
||||
"...the reduction of debt levels to $665 million from $2.6 billion."
|
||||
|
||||
Adjectives may also evoke a Frame. For example, "asleep" may
|
||||
evoke the "Sleep" frame as in:
|
||||
|
||||
"They were asleep for hours."
|
||||
|
||||
Many common nouns, such as artifacts like "hat" or "tower",
|
||||
typically serve as dependents rather than clearly evoking their
|
||||
own frames.
|
||||
|
||||
Details for a specific lexical unit can be obtained using this class's
|
||||
`lus()` function, which takes an optional regular expression
|
||||
pattern that will be matched against the name of the lexical unit:
|
||||
|
||||
>>> from pprint import pprint
|
||||
>>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')))
|
||||
[<lu ID=14733 name=a little.n>, <lu ID=14743 name=a little.adv>, ...]
|
||||
|
||||
You can obtain detailed information on a particular LU by calling the
|
||||
`lu()` function and passing in an LU's 'ID' number:
|
||||
|
||||
>>> from pprint import pprint
|
||||
>>> from nltk.corpus import framenet as fn
|
||||
>>> fn.lu(256).name
|
||||
'foresee.v'
|
||||
>>> fn.lu(256).definition
|
||||
'COD: be aware of beforehand; predict.'
|
||||
>>> fn.lu(256).frame.name
|
||||
'Expectation'
|
||||
>>> fn.lu(256).lexemes[0].name
|
||||
'foresee'
|
||||
|
||||
Note that LU names take the form of a dotted string (e.g. "run.v" or "a
|
||||
little.adv") in which a lemma precedes the "." and a part of speech
|
||||
(POS) follows the dot. The lemma may be composed of a single lexeme
|
||||
(e.g. "run") or of multiple lexemes (e.g. "a little"). The list of
|
||||
POSs used in the LUs is:
|
||||
|
||||
v - verb
|
||||
n - noun
|
||||
a - adjective
|
||||
adv - adverb
|
||||
prep - preposition
|
||||
num - numbers
|
||||
intj - interjection
|
||||
art - article
|
||||
c - conjunction
|
||||
scon - subordinating conjunction
|
||||
|
||||
For more detailed information about the info that is contained in the
|
||||
dict that is returned by the `lu()` function, see the documentation on
|
||||
the `lu()` function.
|
||||
|
||||
-------------------
|
||||
Annotated Documents
|
||||
-------------------
|
||||
|
||||
The FrameNet corpus contains a small set of annotated documents. A list
|
||||
of these documents can be obtained by calling the `docs()` function:
|
||||
|
||||
>>> from pprint import pprint
|
||||
>>> from nltk.corpus import framenet as fn
|
||||
>>> d = fn.docs('BellRinging')[0]
|
||||
>>> d.corpname
|
||||
'PropBank'
|
||||
>>> d.sentence[49]
|
||||
full-text sentence (...) in BellRinging:
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
[POS] 17 tags
|
||||
<BLANKLINE>
|
||||
[POS_tagset] PENN
|
||||
<BLANKLINE>
|
||||
[text] + [annotationSet]
|
||||
<BLANKLINE>
|
||||
`` I live in hopes that the ringers themselves will be drawn into
|
||||
***** ******* *****
|
||||
Desir Cause_t Cause
|
||||
[1] [3] [2]
|
||||
<BLANKLINE>
|
||||
that fuller life .
|
||||
******
|
||||
Comple
|
||||
[4]
|
||||
(Desir=Desiring, Cause_t=Cause_to_make_noise, Cause=Cause_motion, Comple=Completeness)
|
||||
<BLANKLINE>
|
||||
|
||||
>>> d.sentence[49].annotationSet[1]
|
||||
annotation set (...):
|
||||
<BLANKLINE>
|
||||
[status] MANUAL
|
||||
<BLANKLINE>
|
||||
[LU] (6605) hope.n in Desiring
|
||||
<BLANKLINE>
|
||||
[frame] (366) Desiring
|
||||
<BLANKLINE>
|
||||
[GF] 2 relations
|
||||
<BLANKLINE>
|
||||
[PT] 2 phrases
|
||||
<BLANKLINE>
|
||||
[text] + [Target] + [FE] + [Noun]
|
||||
<BLANKLINE>
|
||||
`` I live in hopes that the ringers themselves will be drawn into
|
||||
- ^^^^ ^^ ***** ----------------------------------------------
|
||||
E supp su Event
|
||||
<BLANKLINE>
|
||||
that fuller life .
|
||||
-----------------
|
||||
<BLANKLINE>
|
||||
(E=Experiencer, su=supp)
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
@@ -0,0 +1,66 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===============================================
|
||||
Generating sentences from context-free grammars
|
||||
===============================================
|
||||
|
||||
An example grammar:
|
||||
|
||||
>>> from nltk.parse.generate import generate, demo_grammar
|
||||
>>> from nltk import CFG
|
||||
>>> grammar = CFG.fromstring(demo_grammar)
|
||||
>>> print(grammar)
|
||||
Grammar with 13 productions (start state = S)
|
||||
S -> NP VP
|
||||
NP -> Det N
|
||||
PP -> P NP
|
||||
VP -> 'slept'
|
||||
VP -> 'saw' NP
|
||||
VP -> 'walked' PP
|
||||
Det -> 'the'
|
||||
Det -> 'a'
|
||||
N -> 'man'
|
||||
N -> 'park'
|
||||
N -> 'dog'
|
||||
P -> 'in'
|
||||
P -> 'with'
|
||||
|
||||
The first 10 generated sentences:
|
||||
|
||||
>>> for sentence in generate(grammar, n=10):
|
||||
... print(' '.join(sentence))
|
||||
the man slept
|
||||
the man saw the man
|
||||
the man saw the park
|
||||
the man saw the dog
|
||||
the man saw a man
|
||||
the man saw a park
|
||||
the man saw a dog
|
||||
the man walked in the man
|
||||
the man walked in the park
|
||||
the man walked in the dog
|
||||
|
||||
All sentences of max depth 4:
|
||||
|
||||
>>> for sentence in generate(grammar, depth=4):
|
||||
... print(' '.join(sentence))
|
||||
the man slept
|
||||
the park slept
|
||||
the dog slept
|
||||
a man slept
|
||||
a park slept
|
||||
a dog slept
|
||||
|
||||
The number of sentences of different max depths:
|
||||
|
||||
>>> len(list(generate(grammar, depth=3)))
|
||||
0
|
||||
>>> len(list(generate(grammar, depth=4)))
|
||||
6
|
||||
>>> len(list(generate(grammar, depth=5)))
|
||||
42
|
||||
>>> len(list(generate(grammar, depth=6)))
|
||||
114
|
||||
>>> len(list(generate(grammar)))
|
||||
114
|
||||
@@ -0,0 +1,141 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=======================================
|
||||
Demonstrate word embedding using Gensim
|
||||
=======================================
|
||||
|
||||
>>> from nltk.test.gensim_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
We demonstrate three functions:
|
||||
- Train the word embeddings using brown corpus;
|
||||
- Load the pre-trained model and perform simple tasks; and
|
||||
- Prune the pre-trained binary model.
|
||||
|
||||
>>> import gensim
|
||||
|
||||
---------------
|
||||
Train the model
|
||||
---------------
|
||||
|
||||
Here we train a word embedding using the Brown Corpus:
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> train_set = brown.sents()[:10000]
|
||||
>>> model = gensim.models.Word2Vec(train_set)
|
||||
|
||||
It might take some time to train the model. So, after it is trained, it can be saved as follows:
|
||||
|
||||
>>> model.save('brown.embedding')
|
||||
>>> new_model = gensim.models.Word2Vec.load('brown.embedding')
|
||||
|
||||
The model stores each vocabulary word together with its embedding. We can easily get the vector representation of a word.
|
||||
|
||||
>>> len(new_model.wv['university'])
|
||||
100
|
||||
|
||||
There are some supporting functions already implemented in Gensim to manipulate word embeddings.
|
||||
For example, to compute the cosine similarity between 2 words:
|
||||
|
||||
>>> new_model.wv.similarity('university','school') > 0.3
|
||||
True
|
||||
|
||||
---------------------------
|
||||
Using the pre-trained model
|
||||
---------------------------
|
||||
|
||||
NLTK includes a pre-trained model which is part of a model that is trained on 100 billion words from the Google News Dataset.
|
||||
The full model is from https://code.google.com/p/word2vec/ (about 3 GB).
|
||||
|
||||
>>> from nltk.data import find
|
||||
>>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
|
||||
>>> model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
|
||||
|
||||
We pruned the model to only include the most common words (~44k words).
|
||||
|
||||
>>> len(model)
|
||||
43981
|
||||
|
||||
Each word is represented in the space of 300 dimensions:
|
||||
|
||||
>>> len(model['university'])
|
||||
300
|
||||
|
||||
Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score.
|
||||
|
||||
>>> model.most_similar(positive=['university'], topn = 3)
|
||||
[('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)]
|
||||
|
||||
Finding the word that does not belong in a list is also supported, although implementing this yourself is simple.
|
||||
|
||||
>>> model.doesnt_match('breakfast cereal dinner lunch'.split())
|
||||
'cereal'
|
||||
|
||||
Mikolov et al. (2013) figured out that word embeddings capture many syntactic and semantic regularities. For example,
|
||||
the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'.
|
||||
|
||||
>>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)
|
||||
[('queen', 0.71181...)]
|
||||
|
||||
>>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1)
|
||||
[('France', 0.78840...)]
|
||||
|
||||
We can visualize the word embeddings using t-SNE (https://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words.
|
||||
|
||||
| import numpy as np
|
||||
| labels = []
|
||||
| count = 0
|
||||
| max_count = 1000
|
||||
| X = np.zeros(shape=(max_count,len(model['university'])))
|
||||
|
|
||||
| for term in model.index_to_key:
|
||||
| X[count] = model[term]
|
||||
| labels.append(term)
|
||||
| count+= 1
|
||||
| if count >= max_count: break
|
||||
|
|
||||
| # It is recommended to use PCA first to reduce to ~50 dimensions
|
||||
| from sklearn.decomposition import PCA
|
||||
| pca = PCA(n_components=50)
|
||||
| X_50 = pca.fit_transform(X)
|
||||
|
|
||||
| # Using TSNE to further reduce to 2 dimensions
|
||||
| from sklearn.manifold import TSNE
|
||||
| model_tsne = TSNE(n_components=2, random_state=0)
|
||||
| Y = model_tsne.fit_transform(X_50)
|
||||
|
|
||||
| # Show the scatter plot
|
||||
| import matplotlib.pyplot as plt
|
||||
| plt.scatter(Y[:,0], Y[:,1], 20)
|
||||
|
|
||||
| # Add labels
|
||||
| for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
|
||||
| plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10)
|
||||
|
|
||||
| plt.show()
|
||||
|
||||
------------------------------
|
||||
Prune the trained binary model
|
||||
------------------------------
|
||||
|
||||
Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/
|
||||
We use this code to get the `word2vec_sample` model.
|
||||
|
||||
| import gensim
|
||||
| # Load the binary model
|
||||
| model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
|
||||
|
|
||||
| # Only output words that appear in the Brown corpus
|
||||
| from nltk.corpus import brown
|
||||
| words = set(brown.words())
|
||||
| print(len(words))
|
||||
|
|
||||
| # Output presented word to a temporary file
|
||||
| out_file = 'pruned.word2vec.txt'
|
||||
| with open(out_file,'w') as f:
|
||||
| word_presented = words.intersection(model.index_to_key)
|
||||
| f.write('{} {}\n'.format(len(word_presented),len(model['word'])))
|
||||
|
|
||||
| for word in word_presented:
|
||||
| f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word])))
|
||||
@@ -0,0 +1,4 @@
|
||||
def setup_module():
    """Skip the gensim doctests when the optional ``gensim`` package is missing.

    ``importorskip`` tries to import the named module and raises pytest's
    skip outcome if the import fails, so the doctests that call this
    fixture are skipped instead of erroring out.
    """
    # Imported lazily so that merely importing this fixture module does not
    # require pytest to be installed.
    from pytest import importorskip

    importorskip("gensim")
|
||||
@@ -0,0 +1,383 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==============================================================================
|
||||
Glue Semantics
|
||||
==============================================================================
|
||||
|
||||
|
||||
|
||||
======================
|
||||
Linear logic
|
||||
======================
|
||||
|
||||
>>> from nltk.sem import logic
|
||||
>>> from nltk.sem.glue import *
|
||||
>>> from nltk.sem.linearlogic import *
|
||||
|
||||
>>> from nltk.sem.linearlogic import Expression
|
||||
>>> read_expr = Expression.fromstring
|
||||
|
||||
Parser
|
||||
|
||||
>>> print(read_expr(r'f'))
|
||||
f
|
||||
>>> print(read_expr(r'(g -o f)'))
|
||||
(g -o f)
|
||||
>>> print(read_expr(r'(g -o (h -o f))'))
|
||||
(g -o (h -o f))
|
||||
>>> print(read_expr(r'((g -o G) -o G)'))
|
||||
((g -o G) -o G)
|
||||
>>> print(read_expr(r'(g -o f)(g)'))
|
||||
(g -o f)(g)
|
||||
>>> print(read_expr(r'((g -o G) -o G)((g -o f))'))
|
||||
((g -o G) -o G)((g -o f))
|
||||
|
||||
Simplify
|
||||
|
||||
>>> print(read_expr(r'f').simplify())
|
||||
f
|
||||
>>> print(read_expr(r'(g -o f)').simplify())
|
||||
(g -o f)
|
||||
>>> print(read_expr(r'((g -o G) -o G)').simplify())
|
||||
((g -o G) -o G)
|
||||
>>> print(read_expr(r'(g -o f)(g)').simplify())
|
||||
f
|
||||
>>> try: read_expr(r'(g -o f)(f)').simplify()
|
||||
... except LinearLogicApplicationException as e: print(e)
|
||||
...
|
||||
Cannot apply (g -o f) to f. Cannot unify g with f given {}
|
||||
>>> print(read_expr(r'(G -o f)(g)').simplify())
|
||||
f
|
||||
>>> print(read_expr(r'((g -o G) -o G)((g -o f))').simplify())
|
||||
f
|
||||
|
||||
Test BindingDict
|
||||
|
||||
>>> h = ConstantExpression('h')
|
||||
>>> g = ConstantExpression('g')
|
||||
>>> f = ConstantExpression('f')
|
||||
|
||||
>>> H = VariableExpression('H')
|
||||
>>> G = VariableExpression('G')
|
||||
>>> F = VariableExpression('F')
|
||||
|
||||
>>> d1 = BindingDict({H: h})
|
||||
>>> d2 = BindingDict({F: f, G: F})
|
||||
>>> d12 = d1 + d2
|
||||
>>> all12 = ['%s: %s' % (v, d12[v]) for v in d12.d]
|
||||
>>> all12.sort()
|
||||
>>> print(all12)
|
||||
['F: f', 'G: f', 'H: h']
|
||||
|
||||
>>> BindingDict([(F,f),(G,g),(H,h)]) == BindingDict({F:f, G:g, H:h})
|
||||
True
|
||||
|
||||
>>> d4 = BindingDict({F: f})
|
||||
>>> try: d4[F] = g
|
||||
... except VariableBindingException as e: print(e)
|
||||
Variable F already bound to another value
|
||||
|
||||
Test Unify
|
||||
|
||||
>>> try: f.unify(g, BindingDict())
|
||||
... except UnificationException as e: print(e)
|
||||
...
|
||||
Cannot unify f with g given {}
|
||||
|
||||
>>> f.unify(G, BindingDict()) == BindingDict({G: f})
|
||||
True
|
||||
>>> try: f.unify(G, BindingDict({G: h}))
|
||||
... except UnificationException as e: print(e)
|
||||
...
|
||||
Cannot unify f with G given {G: h}
|
||||
>>> f.unify(G, BindingDict({G: f})) == BindingDict({G: f})
|
||||
True
|
||||
>>> f.unify(G, BindingDict({H: f})) == BindingDict({G: f, H: f})
|
||||
True
|
||||
|
||||
>>> G.unify(f, BindingDict()) == BindingDict({G: f})
|
||||
True
|
||||
>>> try: G.unify(f, BindingDict({G: h}))
|
||||
... except UnificationException as e: print(e)
|
||||
...
|
||||
Cannot unify G with f given {G: h}
|
||||
>>> G.unify(f, BindingDict({G: f})) == BindingDict({G: f})
|
||||
True
|
||||
>>> G.unify(f, BindingDict({H: f})) == BindingDict({G: f, H: f})
|
||||
True
|
||||
|
||||
>>> G.unify(F, BindingDict()) == BindingDict({G: F})
|
||||
True
|
||||
>>> try: G.unify(F, BindingDict({G: H}))
|
||||
... except UnificationException as e: print(e)
|
||||
...
|
||||
Cannot unify G with F given {G: H}
|
||||
>>> G.unify(F, BindingDict({G: F})) == BindingDict({G: F})
|
||||
True
|
||||
>>> G.unify(F, BindingDict({H: F})) == BindingDict({G: F, H: F})
|
||||
True
|
||||
|
||||
Test Compile
|
||||
|
||||
>>> print(read_expr('g').compile_pos(Counter(), GlueFormula))
|
||||
(<ConstantExpression g>, [])
|
||||
>>> print(read_expr('(g -o f)').compile_pos(Counter(), GlueFormula))
|
||||
(<ImpExpression (g -o f)>, [])
|
||||
>>> print(read_expr('(g -o (h -o f))').compile_pos(Counter(), GlueFormula))
|
||||
(<ImpExpression (g -o (h -o f))>, [])
|
||||
|
||||
|
||||
======================
|
||||
Glue
|
||||
======================
|
||||
|
||||
Demo of "John walks"
|
||||
--------------------
|
||||
|
||||
>>> john = GlueFormula("John", "g")
|
||||
>>> print(john)
|
||||
John : g
|
||||
>>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)")
|
||||
>>> print(walks)
|
||||
\x.walks(x) : (g -o f)
|
||||
>>> print(walks.applyto(john))
|
||||
\x.walks(x)(John) : (g -o f)(g)
|
||||
>>> print(walks.applyto(john).simplify())
|
||||
walks(John) : f
|
||||
|
||||
|
||||
Demo of "A dog walks"
|
||||
---------------------
|
||||
|
||||
>>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((gv -o gr) -o ((g -o G) -o G))")
|
||||
>>> print(a)
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
|
||||
>>> man = GlueFormula(r"\x.man(x)", "(gv -o gr)")
|
||||
>>> print(man)
|
||||
\x.man(x) : (gv -o gr)
|
||||
>>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)")
|
||||
>>> print(walks)
|
||||
\x.walks(x) : (g -o f)
|
||||
>>> a_man = a.applyto(man)
|
||||
>>> print(a_man.simplify())
|
||||
\Q.exists x.(man(x) & Q(x)) : ((g -o G) -o G)
|
||||
>>> a_man_walks = a_man.applyto(walks)
|
||||
>>> print(a_man_walks.simplify())
|
||||
exists x.(man(x) & walks(x)) : f
|
||||
|
||||
|
||||
Demo of 'every girl chases a dog'
|
||||
---------------------------------
|
||||
|
||||
Individual words:
|
||||
|
||||
>>> every = GlueFormula("\\P Q.all x.(P(x) -> Q(x))", "((gv -o gr) -o ((g -o G) -o G))")
|
||||
>>> print(every)
|
||||
\P Q.all x.(P(x) -> Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
|
||||
>>> girl = GlueFormula(r"\x.girl(x)", "(gv -o gr)")
|
||||
>>> print(girl)
|
||||
\x.girl(x) : (gv -o gr)
|
||||
>>> chases = GlueFormula(r"\x y.chases(x,y)", "(g -o (h -o f))")
|
||||
>>> print(chases)
|
||||
\x y.chases(x,y) : (g -o (h -o f))
|
||||
>>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((hv -o hr) -o ((h -o H) -o H))")
|
||||
>>> print(a)
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((hv -o hr) -o ((h -o H) -o H))
|
||||
>>> dog = GlueFormula(r"\x.dog(x)", "(hv -o hr)")
|
||||
>>> print(dog)
|
||||
\x.dog(x) : (hv -o hr)
|
||||
|
||||
Noun Quantification can only be done one way:
|
||||
|
||||
>>> every_girl = every.applyto(girl)
|
||||
>>> print(every_girl.simplify())
|
||||
\Q.all x.(girl(x) -> Q(x)) : ((g -o G) -o G)
|
||||
>>> a_dog = a.applyto(dog)
|
||||
>>> print(a_dog.simplify())
|
||||
\Q.exists x.(dog(x) & Q(x)) : ((h -o H) -o H)
|
||||
|
||||
The first reading is achieved by combining 'chases' with 'a dog' first.
|
||||
Since 'a dog' requires something of the form '(h -o H)' we must
|
||||
get rid of the 'g' in the glue of 'chases'. We will do this with
|
||||
the '-o elimination' rule. So, x1 will be our subject placeholder.
|
||||
|
||||
>>> xPrime = GlueFormula("x1", "g")
|
||||
>>> print(xPrime)
|
||||
x1 : g
|
||||
>>> xPrime_chases = chases.applyto(xPrime)
|
||||
>>> print(xPrime_chases.simplify())
|
||||
\y.chases(x1,y) : (h -o f)
|
||||
>>> xPrime_chases_a_dog = a_dog.applyto(xPrime_chases)
|
||||
>>> print(xPrime_chases_a_dog.simplify())
|
||||
exists x.(dog(x) & chases(x1,x)) : f
|
||||
|
||||
Now we can retract our subject placeholder using lambda-abstraction and
|
||||
combine with the true subject.
|
||||
|
||||
>>> chases_a_dog = xPrime_chases_a_dog.lambda_abstract(xPrime)
|
||||
>>> print(chases_a_dog.simplify())
|
||||
\x1.exists x.(dog(x) & chases(x1,x)) : (g -o f)
|
||||
>>> every_girl_chases_a_dog = every_girl.applyto(chases_a_dog)
|
||||
>>> r1 = every_girl_chases_a_dog.simplify()
|
||||
>>> r2 = GlueFormula(r'all x.(girl(x) -> exists z1.(dog(z1) & chases(x,z1)))', 'f')
|
||||
>>> r1 == r2
|
||||
True
|
||||
|
||||
The second reading is achieved by combining 'every girl' with 'chases' first.
|
||||
|
||||
>>> xPrime = GlueFormula("x1", "g")
|
||||
>>> print(xPrime)
|
||||
x1 : g
|
||||
>>> xPrime_chases = chases.applyto(xPrime)
|
||||
>>> print(xPrime_chases.simplify())
|
||||
\y.chases(x1,y) : (h -o f)
|
||||
>>> yPrime = GlueFormula("x2", "h")
|
||||
>>> print(yPrime)
|
||||
x2 : h
|
||||
>>> xPrime_chases_yPrime = xPrime_chases.applyto(yPrime)
|
||||
>>> print(xPrime_chases_yPrime.simplify())
|
||||
chases(x1,x2) : f
|
||||
>>> chases_yPrime = xPrime_chases_yPrime.lambda_abstract(xPrime)
|
||||
>>> print(chases_yPrime.simplify())
|
||||
\x1.chases(x1,x2) : (g -o f)
|
||||
>>> every_girl_chases_yPrime = every_girl.applyto(chases_yPrime)
|
||||
>>> print(every_girl_chases_yPrime.simplify())
|
||||
all x.(girl(x) -> chases(x,x2)) : f
|
||||
>>> every_girl_chases = every_girl_chases_yPrime.lambda_abstract(yPrime)
|
||||
>>> print(every_girl_chases.simplify())
|
||||
\x2.all x.(girl(x) -> chases(x,x2)) : (h -o f)
|
||||
>>> every_girl_chases_a_dog = a_dog.applyto(every_girl_chases)
|
||||
>>> r1 = every_girl_chases_a_dog.simplify()
|
||||
>>> r2 = GlueFormula(r'exists x.(dog(x) & all z2.(girl(z2) -> chases(z2,x)))', 'f')
|
||||
>>> r1 == r2
|
||||
True
|
||||
|
||||
|
||||
Compilation
|
||||
-----------
|
||||
|
||||
>>> for cp in GlueFormula('m', '(b -o a)').compile(Counter()): print(cp)
|
||||
m : (b -o a) : {1}
|
||||
>>> for cp in GlueFormula('m', '((c -o b) -o a)').compile(Counter()): print(cp)
|
||||
v1 : c : {1}
|
||||
m : (b[1] -o a) : {2}
|
||||
>>> for cp in GlueFormula('m', '((d -o (c -o b)) -o a)').compile(Counter()): print(cp)
|
||||
v1 : c : {1}
|
||||
v2 : d : {2}
|
||||
m : (b[1, 2] -o a) : {3}
|
||||
>>> for cp in GlueFormula('m', '((d -o e) -o ((c -o b) -o a))').compile(Counter()): print(cp)
|
||||
v1 : d : {1}
|
||||
v2 : c : {2}
|
||||
m : (e[1] -o (b[2] -o a)) : {3}
|
||||
>>> for cp in GlueFormula('m', '(((d -o c) -o b) -o a)').compile(Counter()): print(cp)
|
||||
v1 : (d -o c) : {1}
|
||||
m : (b[1] -o a) : {2}
|
||||
>>> for cp in GlueFormula('m', '((((e -o d) -o c) -o b) -o a)').compile(Counter()): print(cp)
|
||||
v1 : e : {1}
|
||||
v2 : (d[1] -o c) : {2}
|
||||
m : (b[2] -o a) : {3}
|
||||
|
||||
|
||||
Demo of 'a man walks' using Compilation
|
||||
---------------------------------------
|
||||
|
||||
Premises
|
||||
|
||||
>>> a = GlueFormula('\\P Q.some x.(P(x) and Q(x))', '((gv -o gr) -o ((g -o G) -o G))')
|
||||
>>> print(a)
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G))
|
||||
|
||||
>>> man = GlueFormula('\\x.man(x)', '(gv -o gr)')
|
||||
>>> print(man)
|
||||
\x.man(x) : (gv -o gr)
|
||||
|
||||
>>> walks = GlueFormula('\\x.walks(x)', '(g -o f)')
|
||||
>>> print(walks)
|
||||
\x.walks(x) : (g -o f)
|
||||
|
||||
Compiled Premises:
|
||||
|
||||
>>> counter = Counter()
|
||||
>>> ahc = a.compile(counter)
|
||||
>>> g1 = ahc[0]
|
||||
>>> print(g1)
|
||||
v1 : gv : {1}
|
||||
>>> g2 = ahc[1]
|
||||
>>> print(g2)
|
||||
v2 : g : {2}
|
||||
>>> g3 = ahc[2]
|
||||
>>> print(g3)
|
||||
\P Q.exists x.(P(x) & Q(x)) : (gr[1] -o (G[2] -o G)) : {3}
|
||||
>>> g4 = man.compile(counter)[0]
|
||||
>>> print(g4)
|
||||
\x.man(x) : (gv -o gr) : {4}
|
||||
>>> g5 = walks.compile(counter)[0]
|
||||
>>> print(g5)
|
||||
\x.walks(x) : (g -o f) : {5}
|
||||
|
||||
Derivation:
|
||||
|
||||
>>> g14 = g4.applyto(g1)
|
||||
>>> print(g14.simplify())
|
||||
man(v1) : gr : {1, 4}
|
||||
>>> g134 = g3.applyto(g14)
|
||||
>>> print(g134.simplify())
|
||||
\Q.exists x.(man(x) & Q(x)) : (G[2] -o G) : {1, 3, 4}
|
||||
>>> g25 = g5.applyto(g2)
|
||||
>>> print(g25.simplify())
|
||||
walks(v2) : f : {2, 5}
|
||||
>>> g12345 = g134.applyto(g25)
|
||||
>>> print(g12345.simplify())
|
||||
exists x.(man(x) & walks(x)) : f : {1, 2, 3, 4, 5}
|
||||
|
||||
---------------------------------
|
||||
Dependency Graph to Glue Formulas
|
||||
---------------------------------
|
||||
>>> from nltk.corpus.reader.dependency import DependencyGraph
|
||||
|
||||
>>> depgraph = DependencyGraph("""1 John _ NNP NNP _ 2 SUBJ _ _
|
||||
... 2 sees _ VB VB _ 0 ROOT _ _
|
||||
... 3 a _ ex_quant ex_quant _ 4 SPEC _ _
|
||||
... 4 dog _ NN NN _ 2 OBJ _ _
|
||||
... """)
|
||||
>>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph)
|
||||
>>> print(gfl) # doctest: +SKIP
|
||||
[\x y.sees(x,y) : (f -o (i -o g)),
|
||||
\x.dog(x) : (iv -o ir),
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I3) -o I3)),
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F4) -o F4)),
|
||||
\x.John(x) : (fv -o fr)]
|
||||
>>> glue = Glue()
|
||||
>>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str):
|
||||
... print(r)
|
||||
exists z1.(John(z1) & exists z2.(dog(z2) & sees(z1,z2)))
|
||||
exists z1.(dog(z1) & exists z2.(John(z2) & sees(z2,z1)))
|
||||
|
||||
-----------------------------------
|
||||
Dependency Graph to LFG f-structure
|
||||
-----------------------------------
|
||||
>>> from nltk.sem.lfg import FStructure
|
||||
|
||||
>>> fstruct = FStructure.read_depgraph(depgraph)
|
||||
|
||||
>>> print(fstruct) # doctest: +SKIP
|
||||
f:[pred 'sees'
|
||||
obj h:[pred 'dog'
|
||||
spec 'a']
|
||||
subj g:[pred 'John']]
|
||||
|
||||
>>> fstruct.to_depgraph().tree().pprint()
|
||||
(sees (dog a) John)
|
||||
|
||||
---------------------------------
|
||||
LFG f-structure to Glue
|
||||
---------------------------------
|
||||
>>> fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')) # doctest: +SKIP
|
||||
[\x y.sees(x,y) : (i -o (g -o f)),
|
||||
\x.dog(x) : (gv -o gr),
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)),
|
||||
\P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)),
|
||||
\x.John(x) : (iv -o ir)]
|
||||
|
||||
.. see gluesemantics_malt.doctest for more
|
||||
@@ -0,0 +1,69 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
.. see also: gluesemantics.doctest
|
||||
|
||||
==============================================================================
|
||||
Glue Semantics
|
||||
==============================================================================
|
||||
|
||||
>>> from nltk.test.gluesemantics_malt_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
>>> from nltk.sem.glue import *
|
||||
>>> nltk.sem.logic._counter._value = 0
|
||||
|
||||
--------------------------------
|
||||
Initialize the Dependency Parser
|
||||
--------------------------------
|
||||
>>> from nltk.parse.malt import MaltParser
|
||||
|
||||
>>> tagger = RegexpTagger(
|
||||
... [('^(John|Mary)$', 'NNP'),
|
||||
... ('^(sees|chases)$', 'VB'),
|
||||
... ('^(a)$', 'ex_quant'),
|
||||
... ('^(every)$', 'univ_quant'),
|
||||
... ('^(girl|dog)$', 'NN')
|
||||
... ]).tag
|
||||
>>> depparser = MaltParser(tagger=tagger)
|
||||
|
||||
--------------------
|
||||
Automated Derivation
|
||||
--------------------
|
||||
>>> glue = Glue(depparser=depparser)
|
||||
>>> readings = glue.parse_to_meaning('every girl chases a dog'.split())
|
||||
>>> for reading in sorted([r.simplify().normalize() for r in readings], key=str):
|
||||
... print(reading.normalize())
|
||||
all z1.(girl(z1) -> exists z2.(dog(z2) & chases(z1,z2)))
|
||||
exists z1.(dog(z1) & all z2.(girl(z2) -> chases(z2,z1)))
|
||||
|
||||
>>> drtglue = DrtGlue(depparser=depparser)
|
||||
>>> readings = drtglue.parse_to_meaning('every girl chases a dog'.split())
|
||||
>>> for reading in sorted([r.simplify().normalize() for r in readings], key=str):
|
||||
... print(reading)
|
||||
([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chases(z1,z2)]))])
|
||||
([z1],[dog(z1), (([z2],[girl(z2)]) -> ([],[chases(z2,z1)]))])
|
||||
|
||||
--------------
|
||||
With inference
|
||||
--------------
|
||||
|
||||
Checking for equality of two DRSs is very useful when generating readings of a sentence.
|
||||
For example, the ``glue`` module generates two readings for the sentence
|
||||
*John sees Mary*:
|
||||
|
||||
>>> from nltk.sem.glue import DrtGlue
|
||||
>>> readings = drtglue.parse_to_meaning('John sees Mary'.split())
|
||||
>>> for drs in sorted([r.simplify().normalize() for r in readings], key=str):
|
||||
... print(drs)
|
||||
([z1,z2],[John(z1), Mary(z2), sees(z1,z2)])
|
||||
([z1,z2],[Mary(z1), John(z2), sees(z2,z1)])
|
||||
|
||||
However, it is easy to tell that these two readings are logically the
|
||||
same, and therefore one of them is superfluous. We can use the theorem prover
|
||||
to determine this equivalence, and then delete one of them. A particular
|
||||
theorem prover may be specified, or the argument may be left off to use the
|
||||
default.
|
||||
|
||||
>>> readings[0].equiv(readings[1])
|
||||
True
|
||||
@@ -0,0 +1,9 @@
|
||||
def setup_module():
    """Skip the calling test module when ``MaltParser`` cannot be constructed.

    ``MaltParser()`` is instantiated purely as an availability probe; the
    resulting object is discarded.  If the constructor raises
    ``AssertionError`` or ``LookupError``, ``pytest.skip`` is called with the
    message "MaltParser is not available".
    """
    import pytest

    from nltk.parse.malt import MaltParser

    try:
        # Only the success or failure of construction matters here, so the
        # instance is not bound to a name.
        MaltParser()
    except (AssertionError, LookupError):
        pytest.skip("MaltParser is not available")
|
||||
@@ -0,0 +1,88 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===============
|
||||
Grammar Parsing
|
||||
===============
|
||||
|
||||
Grammars can be parsed from strings:
|
||||
|
||||
>>> from nltk import CFG
|
||||
>>> grammar = CFG.fromstring("""
|
||||
... S -> NP VP
|
||||
... PP -> P NP
|
||||
... NP -> Det N | NP PP
|
||||
... VP -> V NP | VP PP
|
||||
... Det -> 'a' | 'the'
|
||||
... N -> 'dog' | 'cat'
|
||||
... V -> 'chased' | 'sat'
|
||||
... P -> 'on' | 'in'
|
||||
... """)
|
||||
>>> grammar
|
||||
<Grammar with 14 productions>
|
||||
>>> grammar.start()
|
||||
S
|
||||
>>> grammar.productions()
|
||||
[S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP,
|
||||
Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat',
|
||||
P -> 'on', P -> 'in']
|
||||
|
||||
Probabilistic CFGs:
|
||||
|
||||
>>> from nltk import PCFG
|
||||
>>> toy_pcfg1 = PCFG.fromstring("""
|
||||
... S -> NP VP [1.0]
|
||||
... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
|
||||
... Det -> 'the' [0.8] | 'my' [0.2]
|
||||
... N -> 'man' [0.5] | 'telescope' [0.5]
|
||||
... VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
|
||||
... V -> 'ate' [0.35] | 'saw' [0.65]
|
||||
... PP -> P NP [1.0]
|
||||
... P -> 'with' [0.61] | 'under' [0.39]
|
||||
... """)
|
||||
|
||||
Chomsky Normal Form grammar (Test for bug 474)
|
||||
|
||||
>>> g = CFG.fromstring("VP^<TOP> -> VBP NP^<VP-TOP>")
|
||||
>>> g.productions()[0].lhs()
|
||||
VP^<TOP>
|
||||
|
||||
Grammars can contain both empty strings and empty productions:
|
||||
|
||||
>>> from nltk.grammar import CFG
|
||||
>>> from nltk.parse.generate import generate
|
||||
>>> grammar = CFG.fromstring("""
|
||||
... S -> A B
|
||||
... A -> 'a'
|
||||
... # An empty string:
|
||||
... B -> 'b' | ''
|
||||
... """)
|
||||
>>> list(generate(grammar))
|
||||
[['a', 'b'], ['a', '']]
|
||||
>>> grammar = CFG.fromstring("""
|
||||
... S -> A B
|
||||
... A -> 'a'
|
||||
... # An empty production:
|
||||
... B -> 'b' |
|
||||
... """)
|
||||
>>> list(generate(grammar))
|
||||
[['a', 'b'], ['a']]
|
||||
|
||||
Grammars with mixed rules can be converted into Chomsky Normal Form:
|
||||
|
||||
>>> from nltk import CFG
|
||||
>>> grammar = CFG.fromstring("""
|
||||
... S -> NP VP
|
||||
... PP -> P NP
|
||||
... NP -> NP PP P
|
||||
... NP -> 'the' Nom | 'a' Nom
|
||||
... VP -> V NP | VP PP
|
||||
... Det -> 'a' | 'the'
|
||||
... Nom -> 'dog' | 'cat'
|
||||
... V -> 'chased' | 'sat'
|
||||
... P -> 'on' | 'in'
|
||||
... """)
|
||||
>>> grammar
|
||||
<Grammar with 15 productions>
|
||||
>>> grammar.chomsky_normal_form()
|
||||
<Grammar with 18 productions>
|
||||
@@ -0,0 +1,109 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==========================
|
||||
Test Suites for Grammars
|
||||
==========================
|
||||
|
||||
Sentences in the test suite are divided into two classes:
|
||||
|
||||
- grammatical (*accept*) and
|
||||
- ungrammatical (*reject*).
|
||||
|
||||
If a sentence should parse according to the grammar, the value of
|
||||
``trees`` will be a non-empty list. If a sentence should be rejected
|
||||
according to the grammar, then the value of ``trees`` will be ``None``.
|
||||
|
||||
>>> from nltk.parse import TestGrammar
|
||||
>>> germantest1 = {}
|
||||
>>> germantest1['doc'] = "Tests for person agreement"
|
||||
>>> germantest1['accept'] = [
|
||||
... 'ich komme',
|
||||
... 'ich sehe mich',
|
||||
... 'du kommst',
|
||||
... 'du siehst mich',
|
||||
... 'sie kommt',
|
||||
... 'sie sieht mich',
|
||||
... 'ihr kommt',
|
||||
... 'wir kommen',
|
||||
... 'sie kommen',
|
||||
... 'du magst mich',
|
||||
... 'er mag mich',
|
||||
... 'du folgst mir',
|
||||
... 'sie hilft mir',
|
||||
... ]
|
||||
>>> germantest1['reject'] = [
|
||||
... 'ich kommt',
|
||||
... 'ich kommst',
|
||||
... 'ich siehst mich',
|
||||
... 'du komme',
|
||||
... 'du sehe mich',
|
||||
... 'du kommt',
|
||||
... 'er komme',
|
||||
... 'er siehst mich',
|
||||
... 'wir komme',
|
||||
... 'wir kommst',
|
||||
... 'die Katzen kommst',
|
||||
... 'sie komme',
|
||||
... 'sie kommst',
|
||||
... 'du mag mich',
|
||||
... 'er magst mich',
|
||||
... 'du folgt mir',
|
||||
... 'sie hilfst mir',
|
||||
... ]
|
||||
>>> germantest2 = {}
|
||||
>>> germantest2['doc'] = "Tests for number agreement"
|
||||
>>> germantest2['accept'] = [
|
||||
... 'der Hund kommt',
|
||||
... 'die Hunde kommen',
|
||||
... 'ich komme',
|
||||
... 'wir kommen',
|
||||
... 'ich sehe die Katzen',
|
||||
... 'ich folge den Katzen',
|
||||
... 'ich sehe die Katzen',
|
||||
... 'ich folge den Katzen',
|
||||
... 'wir sehen die Katzen',
|
||||
... 'wir folgen den Katzen'
|
||||
... ]
|
||||
>>> germantest2['reject'] = [
|
||||
... 'ich kommen',
|
||||
... 'wir komme',
|
||||
... 'der Hunde kommt',
|
||||
... 'der Hunde kommen',
|
||||
... 'die Katzen kommt',
|
||||
... 'ich sehe der Hunde',
|
||||
... 'ich folge den Hund',
|
||||
... 'ich sehen der Hunde',
|
||||
... 'ich folgen den Hund',
|
||||
... 'wir sehe die Katzen',
|
||||
... 'wir folge den Katzen'
|
||||
... ]
|
||||
>>> germantest3 = {}
|
||||
>>> germantest3['doc'] = "Tests for case government and subcategorization"
|
||||
>>> germantest3['accept'] = [
|
||||
... 'der Hund sieht mich',
|
||||
... 'der Hund kommt',
|
||||
... 'ich sehe den Hund',
|
||||
... 'ich helfe dem Hund',
|
||||
... ]
|
||||
>>> germantest3['reject'] = [
|
||||
... 'ich sehe',
|
||||
... 'ich helfe',
|
||||
... 'ich komme den Hund',
|
||||
... 'ich sehe den Hund die Katzen',
|
||||
... 'du hilfst mich',
|
||||
... 'du siehst mir',
|
||||
... 'du siehst ich',
|
||||
... 'der Hunde kommt mich',
|
||||
... 'die Hunde sehe die Hunde',
|
||||
... 'der Hund sehe die Hunde',
|
||||
... 'ich hilft den Hund',
|
||||
... 'ich hilft der Hund',
|
||||
... 'ich sehe dem Hund',
|
||||
... ]
|
||||
>>> germantestsuites = [germantest1, germantest2, germantest3]
|
||||
>>> tester = TestGrammar('grammars/book_grammars/german.fcfg', germantestsuites)
|
||||
>>> tester.run()
|
||||
Tests for person agreement: All tests passed!
|
||||
Tests for number agreement: All tests passed!
|
||||
Tests for case government and subcategorization: All tests passed!
|
||||
@@ -0,0 +1,100 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
.. _align howto: align.html
|
||||
.. _ccg howto: ccg.html
|
||||
.. _chat80 howto: chat80.html
|
||||
.. _childes howto: childes.html
|
||||
.. _chunk howto: chunk.html
|
||||
.. _classify howto: classify.html
|
||||
.. _collocations howto: collocations.html
|
||||
.. _compat howto: compat.html
|
||||
.. _corpus howto: corpus.html
|
||||
.. _data howto: data.html
|
||||
.. _dependency howto: dependency.html
|
||||
.. _discourse howto: discourse.html
|
||||
.. _drt howto: drt.html
|
||||
.. _featgram howto: featgram.html
|
||||
.. _featstruct howto: featstruct.html
|
||||
.. _framenet howto: framenet.html
|
||||
.. _generate howto: generate.html
|
||||
.. _gluesemantics howto: gluesemantics.html
|
||||
.. _gluesemantics_malt howto: gluesemantics_malt.html
|
||||
.. _grammar howto: grammar.html
|
||||
.. _grammartestsuites howto: grammartestsuites.html
|
||||
.. _index howto: index.html
|
||||
.. _inference howto: inference.html
|
||||
.. _internals howto: internals.html
|
||||
.. _japanese howto: japanese.html
|
||||
.. _logic howto: logic.html
|
||||
.. _metrics howto: metrics.html
|
||||
.. _misc howto: misc.html
|
||||
.. _nonmonotonic howto: nonmonotonic.html
|
||||
.. _parse howto: parse.html
|
||||
.. _portuguese_en howto: portuguese_en.html
|
||||
.. _probability howto: probability.html
|
||||
.. _propbank howto: propbank.html
|
||||
.. _relextract howto: relextract.html
|
||||
.. _resolution howto: resolution.html
|
||||
.. _semantics howto: semantics.html
|
||||
.. _simple howto: simple.html
|
||||
.. _stem howto: stem.html
|
||||
.. _tag howto: tag.html
|
||||
.. _tokenize howto: tokenize.html
|
||||
.. _toolbox howto: toolbox.html
|
||||
.. _tree howto: tree.html
|
||||
.. _treetransforms howto: treetransforms.html
|
||||
.. _util howto: util.html
|
||||
.. _wordnet howto: wordnet.html
|
||||
.. _wordnet_lch howto: wordnet_lch.html
|
||||
|
||||
===========
|
||||
NLTK HOWTOs
|
||||
===========
|
||||
|
||||
* `align HOWTO`_
|
||||
* `ccg HOWTO`_
|
||||
* `chat80 HOWTO`_
|
||||
* `childes HOWTO`_
|
||||
* `chunk HOWTO`_
|
||||
* `classify HOWTO`_
|
||||
* `collocations HOWTO`_
|
||||
* `compat HOWTO`_
|
||||
* `corpus HOWTO`_
|
||||
* `data HOWTO`_
|
||||
* `dependency HOWTO`_
|
||||
* `discourse HOWTO`_
|
||||
* `drt HOWTO`_
|
||||
* `featgram HOWTO`_
|
||||
* `featstruct HOWTO`_
|
||||
* `framenet HOWTO`_
|
||||
* `generate HOWTO`_
|
||||
* `gluesemantics HOWTO`_
|
||||
* `gluesemantics_malt HOWTO`_
|
||||
* `grammar HOWTO`_
|
||||
* `grammartestsuites HOWTO`_
|
||||
* `index HOWTO`_
|
||||
* `inference HOWTO`_
|
||||
* `internals HOWTO`_
|
||||
* `japanese HOWTO`_
|
||||
* `logic HOWTO`_
|
||||
* `metrics HOWTO`_
|
||||
* `misc HOWTO`_
|
||||
* `nonmonotonic HOWTO`_
|
||||
* `parse HOWTO`_
|
||||
* `portuguese_en HOWTO`_
|
||||
* `probability HOWTO`_
|
||||
* `propbank HOWTO`_
|
||||
* `relextract HOWTO`_
|
||||
* `resolution HOWTO`_
|
||||
* `semantics HOWTO`_
|
||||
* `simple HOWTO`_
|
||||
* `stem HOWTO`_
|
||||
* `tag HOWTO`_
|
||||
* `tokenize HOWTO`_
|
||||
* `toolbox HOWTO`_
|
||||
* `tree HOWTO`_
|
||||
* `treetransforms HOWTO`_
|
||||
* `util HOWTO`_
|
||||
* `wordnet HOWTO`_
|
||||
* `wordnet_lch HOWTO`_
|
||||
@@ -0,0 +1,536 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
====================================
|
||||
Logical Inference and Model Building
|
||||
====================================
|
||||
|
||||
>>> from nltk.test.setup_fixt import check_binary
|
||||
>>> check_binary('mace4')
|
||||
|
||||
>>> from nltk import *
|
||||
>>> from nltk.sem.drt import DrtParser
|
||||
>>> from nltk.sem import logic
|
||||
>>> logic._counter._value = 0
|
||||
|
||||
------------
|
||||
Introduction
|
||||
------------
|
||||
|
||||
Within the area of automated reasoning, first order theorem proving
|
||||
and model building (or model generation) have both received much
|
||||
attention, and have given rise to highly sophisticated techniques. We
|
||||
focus therefore on providing an NLTK interface to third party tools
|
||||
for these tasks. In particular, the module ``nltk.inference`` can be
|
||||
used to access both theorem provers and model builders.
|
||||
|
||||
---------------------------------
|
||||
NLTK Interface to Theorem Provers
|
||||
---------------------------------
|
||||
|
||||
The main class used to interface with a theorem prover is the ``Prover``
|
||||
class, found in ``nltk.inference.api``. The ``prove()`` method takes three optional
|
||||
arguments: a goal, a list of assumptions, and a ``verbose`` boolean to
|
||||
indicate whether the proof should be printed to the console. The proof goal
|
||||
and any assumptions need to be instances of the ``Expression`` class
|
||||
specified by ``nltk.sem.logic``. There are currently three theorem provers
|
||||
included with NLTK: ``Prover9``, ``TableauProver``, and
|
||||
``ResolutionProver``. The first is an off-the-shelf prover, while the other
|
||||
two are written in Python and included in the ``nltk.inference`` package.
|
||||
|
||||
>>> from nltk.sem import Expression
|
||||
>>> read_expr = Expression.fromstring
|
||||
>>> p1 = read_expr('man(socrates)')
|
||||
>>> p2 = read_expr('all x.(man(x) -> mortal(x))')
|
||||
>>> c = read_expr('mortal(socrates)')
|
||||
>>> Prover9().prove(c, [p1,p2])
|
||||
True
|
||||
>>> TableauProver().prove(c, [p1,p2])
|
||||
True
|
||||
>>> ResolutionProver().prove(c, [p1,p2], verbose=True)
|
||||
[1] {-mortal(socrates)} A
|
||||
[2] {man(socrates)} A
|
||||
[3] {-man(z2), mortal(z2)} A
|
||||
[4] {-man(socrates)} (1, 3)
|
||||
[5] {mortal(socrates)} (2, 3)
|
||||
[6] {} (1, 5)
|
||||
<BLANKLINE>
|
||||
True
|
||||
|
||||
---------------------
|
||||
The ``ProverCommand``
|
||||
---------------------
|
||||
|
||||
A ``ProverCommand`` is a stateful holder for a theorem
|
||||
prover. The command stores a theorem prover instance (of type ``Prover``),
|
||||
a goal, a list of assumptions, the result of the proof, and a string version
|
||||
of the entire proof. Corresponding to the three included ``Prover``
|
||||
implementations, there are three ``ProverCommand`` implementations:
|
||||
``Prover9Command``, ``TableauProverCommand``, and
|
||||
``ResolutionProverCommand``.
|
||||
|
||||
The ``ProverCommand``'s constructor takes its goal and assumptions. The
|
||||
``prove()`` command executes the ``Prover`` and ``proof()``
|
||||
returns a String form of the proof.
|
||||
If the ``prove()`` method has not been called,
|
||||
then the prover command will be unable to display a proof.
|
||||
|
||||
>>> prover = ResolutionProverCommand(c, [p1,p2])
|
||||
>>> print(prover.proof())
|
||||
Traceback (most recent call last):
|
||||
File "...", line 1212, in __run
|
||||
compileflags, 1) in test.globs
|
||||
File "<doctest nltk/test/inference.doctest[10]>", line 1, in <module>
|
||||
File "...", line ..., in proof
|
||||
raise LookupError("You have to call prove() first to get a proof!")
|
||||
LookupError: You have to call prove() first to get a proof!
|
||||
>>> prover.prove()
|
||||
True
|
||||
>>> print(prover.proof())
|
||||
[1] {-mortal(socrates)} A
|
||||
[2] {man(socrates)} A
|
||||
[3] {-man(z4), mortal(z4)} A
|
||||
[4] {-man(socrates)} (1, 3)
|
||||
[5] {mortal(socrates)} (2, 3)
|
||||
[6] {} (1, 5)
|
||||
<BLANKLINE>
|
||||
|
||||
The prover command stores the result of proving so that if ``prove()`` is
|
||||
called again, then the command can return the result without executing the
|
||||
prover again. This allows the user to access the result of the proof without
|
||||
wasting time re-computing what it already knows.
|
||||
|
||||
>>> prover.prove()
|
||||
True
|
||||
>>> prover.prove()
|
||||
True
|
||||
|
||||
The assumptions and goal may be accessed using the ``assumptions()`` and
|
||||
``goal()`` methods, respectively.
|
||||
|
||||
>>> prover.assumptions()
|
||||
[<ApplicationExpression man(socrates)>, <AllExpression all x.(man(x) -> mortal(x))>]
|
||||
>>> prover.goal()
|
||||
<ApplicationExpression mortal(socrates)>
|
||||
|
||||
The assumptions list may be modified using the ``add_assumptions()`` and
|
||||
``retract_assumptions()`` methods. Both methods take a list of ``Expression``
|
||||
objects. Since adding or removing assumptions may change the result of the
|
||||
proof, the stored result is cleared when either of these methods is called.
|
||||
That means that ``proof()`` will be unavailable until ``prove()`` is called and
|
||||
a call to ``prove()`` will execute the theorem prover.
|
||||
|
||||
>>> prover.retract_assumptions([read_expr('man(socrates)')])
|
||||
>>> print(prover.proof())
|
||||
Traceback (most recent call last):
|
||||
File "...", line 1212, in __run
|
||||
compileflags, 1) in test.globs
|
||||
File "<doctest nltk/test/inference.doctest[10]>", line 1, in <module>
|
||||
File "...", line ..., in proof
|
||||
raise LookupError("You have to call prove() first to get a proof!")
|
||||
LookupError: You have to call prove() first to get a proof!
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> print(prover.proof())
|
||||
[1] {-mortal(socrates)} A
|
||||
[2] {-man(z6), mortal(z6)} A
|
||||
[3] {-man(socrates)} (1, 2)
|
||||
<BLANKLINE>
|
||||
>>> prover.add_assumptions([read_expr('man(socrates)')])
|
||||
>>> prover.prove()
|
||||
True
|
||||
|
||||
-------
|
||||
Prover9
|
||||
-------
|
||||
|
||||
Prover9 Installation
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can download Prover9 from https://www.cs.unm.edu/~mccune/prover9/.
|
||||
|
||||
Extract the source code into a suitable directory and follow the
|
||||
instructions in the Prover9 ``README.make`` file to compile the executables.
|
||||
Install these into an appropriate location; the
|
||||
``prover9_search`` variable is currently configured to look in the
|
||||
following locations:
|
||||
|
||||
>>> p = Prover9()
|
||||
>>> p.binary_locations()
|
||||
['/usr/local/bin/prover9',
|
||||
'/usr/local/bin/prover9/bin',
|
||||
'/usr/local/bin',
|
||||
'/usr/bin',
|
||||
'/usr/local/prover9',
|
||||
'/usr/local/share/prover9']
|
||||
|
||||
Alternatively, the environment variable ``PROVER9HOME`` may be configured with
|
||||
the binary's location.
|
||||
|
||||
The path to the correct directory can be set manually in the following
|
||||
manner:
|
||||
|
||||
>>> config_prover9(path='/usr/local/bin') # doctest: +SKIP
|
||||
[Found prover9: /usr/local/bin/prover9]
|
||||
|
||||
If the executables cannot be found, ``Prover9`` will raise a ``LookupError`` with the following message:
|
||||
|
||||
>>> p.prove() # doctest: +SKIP
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
LookupError:
|
||||
===========================================================================
|
||||
NLTK was unable to find the prover9 executable! Use config_prover9() or
|
||||
set the PROVER9HOME environment variable.
|
||||
<BLANKLINE>
|
||||
>> config_prover9('/path/to/prover9')
|
||||
<BLANKLINE>
|
||||
For more information, on prover9, see:
|
||||
<https://www.cs.unm.edu/~mccune/prover9/>
|
||||
===========================================================================
|
||||
|
||||
|
||||
Using Prover9
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
The general case in theorem proving is to determine whether ``S |- g``
|
||||
holds, where ``S`` is a possibly empty set of assumptions, and ``g``
|
||||
is a proof goal.
|
||||
|
||||
As mentioned earlier, NLTK input to ``Prover9`` must be
|
||||
``Expression``\ s of ``nltk.sem.logic``. A ``Prover9`` instance is
|
||||
initialized with a proof goal and, possibly, some assumptions. The
|
||||
``prove()`` method attempts to find a proof of the goal, given the
|
||||
list of assumptions (in this case, none).
|
||||
|
||||
>>> goal = read_expr('(man(x) <-> --man(x))')
|
||||
>>> prover = Prover9Command(goal)
|
||||
>>> prover.prove()
|
||||
True
|
||||
|
||||
Given a ``ProverCommand`` instance ``prover``, the method
|
||||
``prover.proof()`` will return a String of the extensive proof information
|
||||
provided by Prover9, shown in abbreviated form here::
|
||||
|
||||
============================== Prover9 ===============================
|
||||
Prover9 (32) version ...
|
||||
Process ... was started by ... on ...
|
||||
...
|
||||
The command was ".../prover9 -f ...".
|
||||
============================== end of head ===========================
|
||||
|
||||
============================== INPUT =================================
|
||||
|
||||
% Reading from file /var/...
|
||||
|
||||
|
||||
formulas(goals).
|
||||
(all x (man(x) -> man(x))).
|
||||
end_of_list.
|
||||
|
||||
...
|
||||
============================== end of search =========================
|
||||
|
||||
THEOREM PROVED
|
||||
|
||||
Exiting with 1 proof.
|
||||
|
||||
Process 6317 exit (max_proofs) Mon Jan 21 15:23:28 2008
|
||||
|
||||
|
||||
As mentioned earlier, we may want to list some assumptions for
|
||||
the proof, as shown here.
|
||||
|
||||
>>> g = read_expr('mortal(socrates)')
|
||||
>>> a1 = read_expr('all x.(man(x) -> mortal(x))')
|
||||
>>> prover = Prover9Command(g, assumptions=[a1])
|
||||
>>> prover.print_assumptions()
|
||||
all x.(man(x) -> mortal(x))
|
||||
|
||||
However, the assumptions are not sufficient to derive the goal:
|
||||
|
||||
>>> print(prover.prove())
|
||||
False
|
||||
|
||||
So let's add another assumption:
|
||||
|
||||
>>> a2 = read_expr('man(socrates)')
|
||||
>>> prover.add_assumptions([a2])
|
||||
>>> prover.print_assumptions()
|
||||
all x.(man(x) -> mortal(x))
|
||||
man(socrates)
|
||||
>>> print(prover.prove())
|
||||
True
|
||||
|
||||
We can also show the assumptions in ``Prover9`` format.
|
||||
|
||||
>>> prover.print_assumptions(output_format='Prover9')
|
||||
all x (man(x) -> mortal(x))
|
||||
man(socrates)
|
||||
|
||||
>>> prover.print_assumptions(output_format='Spass')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NameError: Unrecognized value for 'output_format': Spass
|
||||
|
||||
Assumptions can be retracted from the list of assumptions.
|
||||
|
||||
>>> prover.retract_assumptions([a1])
|
||||
>>> prover.print_assumptions()
|
||||
man(socrates)
|
||||
>>> prover.retract_assumptions([a1])
|
||||
|
||||
Statements can be loaded from a file and parsed. We can then add these
|
||||
statements as new assumptions.
|
||||
|
||||
>>> g = read_expr('all x.(boxer(x) -> -boxerdog(x))')
|
||||
>>> prover = Prover9Command(g)
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> import nltk.data
|
||||
>>> new = nltk.data.load('grammars/sample_grammars/background0.fol')
|
||||
>>> for a in new:
|
||||
... print(a)
|
||||
all x.(boxerdog(x) -> dog(x))
|
||||
all x.(boxer(x) -> person(x))
|
||||
all x.-(dog(x) & person(x))
|
||||
exists x.boxer(x)
|
||||
exists x.boxerdog(x)
|
||||
>>> prover.add_assumptions(new)
|
||||
>>> print(prover.prove())
|
||||
True
|
||||
>>> print(prover.proof())
|
||||
============================== prooftrans ============================
|
||||
Prover9 (...) version ...
|
||||
Process ... was started by ... on ...
|
||||
...
|
||||
The command was ".../prover9".
|
||||
============================== end of head ===========================
|
||||
<BLANKLINE>
|
||||
============================== end of input ==========================
|
||||
<BLANKLINE>
|
||||
============================== PROOF =================================
|
||||
<BLANKLINE>
|
||||
% -------- Comments from original proof --------
|
||||
% Proof 1 at ... seconds.
|
||||
% Length of proof is 13.
|
||||
% Level of proof is 4.
|
||||
% Maximum clause weight is 0.
|
||||
% Given clauses 0.
|
||||
<BLANKLINE>
|
||||
1 (all x (boxerdog(x) -> dog(x))). [assumption].
|
||||
2 (all x (boxer(x) -> person(x))). [assumption].
|
||||
3 (all x -(dog(x) & person(x))). [assumption].
|
||||
6 (all x (boxer(x) -> -boxerdog(x))). [goal].
|
||||
8 -boxerdog(x) | dog(x). [clausify(1)].
|
||||
9 boxerdog(c3). [deny(6)].
|
||||
11 -boxer(x) | person(x). [clausify(2)].
|
||||
12 boxer(c3). [deny(6)].
|
||||
14 -dog(x) | -person(x). [clausify(3)].
|
||||
15 dog(c3). [resolve(9,a,8,a)].
|
||||
18 person(c3). [resolve(12,a,11,a)].
|
||||
19 -person(c3). [resolve(15,a,14,a)].
|
||||
20 $F. [resolve(19,a,18,a)].
|
||||
<BLANKLINE>
|
||||
============================== end of proof ==========================
|
||||
|
||||
----------------------
|
||||
The equiv() method
|
||||
----------------------
|
||||
|
||||
One application of the theorem prover functionality is to check if
|
||||
two Expressions have the same meaning.
|
||||
The ``equiv()`` method calls a theorem prover to determine whether two
|
||||
Expressions are logically equivalent.
|
||||
|
||||
>>> a = read_expr(r'exists x.(man(x) & walks(x))')
|
||||
>>> b = read_expr(r'exists x.(walks(x) & man(x))')
|
||||
>>> print(a.equiv(b))
|
||||
True
|
||||
|
||||
The same method can be used on Discourse Representation Structures (DRSs).
|
||||
In this case, each DRS is converted to a first order logic form, and then
|
||||
passed to the theorem prover.
|
||||
|
||||
>>> dp = DrtParser()
|
||||
>>> a = dp.parse(r'([x],[man(x), walks(x)])')
|
||||
>>> b = dp.parse(r'([x],[walks(x), man(x)])')
|
||||
>>> print(a.equiv(b))
|
||||
True
|
||||
|
||||
|
||||
--------------------------------
|
||||
NLTK Interface to Model Builders
|
||||
--------------------------------
|
||||
|
||||
The top-level interface to model builders is parallel to that for
|
||||
theorem-provers. The ``ModelBuilder`` interface is located
|
||||
in ``nltk.inference.api``. It is currently only implemented by
|
||||
``Mace``, which interfaces with the Mace4 model builder.
|
||||
|
||||
Typically we use a model builder to show that some set of formulas has
|
||||
a model, and is therefore consistent. One way of doing this is by
|
||||
treating our candidate set of sentences as assumptions, and leaving
|
||||
the goal unspecified.
|
||||
Thus, the following interaction shows how both ``{a3, c1}`` and ``{a3, c2}``
|
||||
are consistent sets, since Mace succeeds in building a
|
||||
model for each of them, while ``{c1, c2}`` is inconsistent.
|
||||
|
||||
>>> a3 = read_expr('exists x.(man(x) and walks(x))')
|
||||
>>> c1 = read_expr('mortal(socrates)')
|
||||
>>> c2 = read_expr('-mortal(socrates)')
|
||||
>>> mace = Mace()
|
||||
>>> print(mace.build_model(None, [a3, c1]))
|
||||
True
|
||||
>>> print(mace.build_model(None, [a3, c2]))
|
||||
True
|
||||
|
||||
We can also use the model builder as an adjunct to the theorem prover.
|
||||
Let's suppose we are trying to prove ``S |- g``, i.e. that ``g``
|
||||
is logically entailed by assumptions ``S = {s1, s2, ..., sn}``.
|
||||
We can give this same input to Mace4, and the model builder will try to
|
||||
find a counterexample, that is, to show that ``g`` does *not* follow
|
||||
from ``S``. So, given this input, Mace4 will try to find a model for
|
||||
the set ``S' = {s1, s2, ..., sn, (not g)}``. If ``g`` fails to follow
|
||||
from ``S``, then Mace4 may well return with a counterexample faster
|
||||
than Prover9 concludes that it cannot find the required proof.
|
||||
Conversely, if ``g`` *is* provable from ``S``, Mace4 may take a long
|
||||
time unsuccessfully trying to find a counter model, and will eventually give up.
|
||||
|
||||
In the following example, we see that the model builder does succeed
|
||||
in building a model of the assumptions together with the negation of
|
||||
the goal. That is, it succeeds in finding a model
|
||||
where there is a woman that every man loves; Adam is a man; Eve is a
|
||||
woman; but Adam does not love Eve.
|
||||
|
||||
>>> a4 = read_expr('exists y. (woman(y) & all x. (man(x) -> love(x,y)))')
|
||||
>>> a5 = read_expr('man(adam)')
|
||||
>>> a6 = read_expr('woman(eve)')
|
||||
>>> g = read_expr('love(adam,eve)')
|
||||
>>> print(mace.build_model(g, [a4, a5, a6]))
|
||||
True
|
||||
|
||||
The Model Builder will fail to find a model if the assumptions do entail
|
||||
the goal. Mace will continue to look for models of ever-increasing sizes
|
||||
until the end_size number is reached. By default, end_size is 500,
|
||||
but it can be set manually for quicker response time.
|
||||
|
||||
>>> a7 = read_expr('all x.(man(x) -> mortal(x))')
|
||||
>>> a8 = read_expr('man(socrates)')
|
||||
>>> g2 = read_expr('mortal(socrates)')
|
||||
>>> print(Mace(end_size=50).build_model(g2, [a7, a8]))
|
||||
False
|
||||
|
||||
There is also a ``ModelBuilderCommand`` class that, like ``ProverCommand``,
|
||||
stores a ``ModelBuilder``, a goal, assumptions, a result, and a model. The
|
||||
only implementation in NLTK is ``MaceCommand``.
|
||||
|
||||
|
||||
-----
|
||||
Mace4
|
||||
-----
|
||||
|
||||
Mace4 Installation
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Mace4 is packaged with Prover9, and can be downloaded from the same
|
||||
source, namely https://www.cs.unm.edu/~mccune/prover9/. It is installed
|
||||
in the same manner as Prover9.
|
||||
|
||||
Using Mace4
|
||||
~~~~~~~~~~~
|
||||
|
||||
Check whether Mace4 can find a model.
|
||||
|
||||
>>> a = read_expr('(see(mary,john) & -(mary = john))')
|
||||
>>> mb = MaceCommand(assumptions=[a])
|
||||
>>> mb.build_model()
|
||||
True
|
||||
|
||||
Show the model in 'tabular' format.
|
||||
|
||||
>>> print(mb.model(format='tabular'))
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 2
|
||||
<BLANKLINE>
|
||||
john : 0
|
||||
<BLANKLINE>
|
||||
mary : 1
|
||||
<BLANKLINE>
|
||||
see :
|
||||
| 0 1
|
||||
---+----
|
||||
0 | 0 0
|
||||
1 | 1 0
|
||||
<BLANKLINE>
|
||||
|
||||
Show the model in 'cooked' format.
|
||||
|
||||
>>> print(mb.model(format='cooked'))
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 2
|
||||
<BLANKLINE>
|
||||
john = 0.
|
||||
<BLANKLINE>
|
||||
mary = 1.
|
||||
<BLANKLINE>
|
||||
- see(0,0).
|
||||
- see(0,1).
|
||||
see(1,0).
|
||||
- see(1,1).
|
||||
<BLANKLINE>
|
||||
|
||||
The property ``valuation`` accesses the stored ``Valuation``.
|
||||
|
||||
>>> print(mb.valuation)
|
||||
{'john': 'a', 'mary': 'b', 'see': {('b', 'a')}}
|
||||
|
||||
We can return to our earlier example and inspect the model:
|
||||
|
||||
>>> mb = MaceCommand(g, assumptions=[a4, a5, a6])
|
||||
>>> m = mb.build_model()
|
||||
>>> print(mb.model(format='cooked'))
|
||||
% number = 1
|
||||
% seconds = 0
|
||||
<BLANKLINE>
|
||||
% Interpretation of size 2
|
||||
<BLANKLINE>
|
||||
adam = 0.
|
||||
<BLANKLINE>
|
||||
eve = 0.
|
||||
<BLANKLINE>
|
||||
c1 = 1.
|
||||
<BLANKLINE>
|
||||
man(0).
|
||||
- man(1).
|
||||
<BLANKLINE>
|
||||
woman(0).
|
||||
woman(1).
|
||||
<BLANKLINE>
|
||||
- love(0,0).
|
||||
love(0,1).
|
||||
- love(1,0).
|
||||
- love(1,1).
|
||||
<BLANKLINE>
|
||||
|
||||
Here, we can see that ``adam`` and ``eve`` have been assigned the same
|
||||
individual, namely ``0`` as value; ``0`` is both a man and a woman; a second
|
||||
individual ``1`` is also a woman; and ``0`` loves ``1``. Thus, this is
|
||||
an interpretation in which there is a woman that every man loves but
|
||||
Adam doesn't love Eve.
|
||||
|
||||
Mace can also be used with propositional logic.
|
||||
|
||||
>>> p = read_expr('P')
|
||||
>>> q = read_expr('Q')
|
||||
>>> mb = MaceCommand(q, [p, p>-q])
|
||||
>>> mb.build_model()
|
||||
True
|
||||
>>> mb.valuation['P']
|
||||
True
|
||||
>>> mb.valuation['Q']
|
||||
False
|
||||
@@ -0,0 +1,161 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==========================================
|
||||
Unit tests for the nltk.internals module
|
||||
==========================================
|
||||
|
||||
overridden()
|
||||
~~~~~~~~~~~~
|
||||
>>> from nltk.internals import overridden
|
||||
|
||||
The typical use case is in defining methods for an interface or
|
||||
abstract base class, in such a way that subclasses don't have to
|
||||
implement all of the methods:
|
||||
|
||||
>>> class EaterI(object):
|
||||
... '''Subclass must define eat() or batch_eat().'''
|
||||
... def eat(self, food):
|
||||
... if overridden(self.batch_eat):
|
||||
... return self.batch_eat([food])[0]
|
||||
... else:
|
||||
... raise NotImplementedError()
|
||||
... def batch_eat(self, foods):
|
||||
... return [self.eat(food) for food in foods]
|
||||
|
||||
As long as a subclass implements one method, it will be used to
|
||||
perform the other method:
|
||||
|
||||
>>> class GoodEater1(EaterI):
|
||||
... def eat(self, food):
|
||||
... return 'yum'
|
||||
>>> GoodEater1().eat('steak')
|
||||
'yum'
|
||||
>>> GoodEater1().batch_eat(['steak', 'peas'])
|
||||
['yum', 'yum']
|
||||
|
||||
>>> class GoodEater2(EaterI):
|
||||
... def batch_eat(self, foods):
|
||||
... return ['yum' for food in foods]
|
||||
>>> GoodEater2().eat('steak')
|
||||
'yum'
|
||||
>>> GoodEater2().batch_eat(['steak', 'peas'])
|
||||
['yum', 'yum']
|
||||
|
||||
But if a subclass doesn't implement either one, then they'll get an
|
||||
error when they try to call them. (nb this is better than infinite
|
||||
recursion):
|
||||
|
||||
>>> class BadEater1(EaterI):
|
||||
... pass
|
||||
>>> BadEater1().eat('steak')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
>>> BadEater1().batch_eat(['steak', 'peas'])
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
|
||||
Trying to use the abstract base class itself will also result in an
|
||||
error:
|
||||
|
||||
>>> class EaterI(EaterI):
|
||||
... pass
|
||||
>>> EaterI().eat('steak')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
>>> EaterI().batch_eat(['steak', 'peas'])
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
|
||||
It's ok to use intermediate abstract classes:
|
||||
|
||||
>>> class AbstractEater(EaterI):
|
||||
... pass
|
||||
|
||||
>>> class GoodEater3(AbstractEater):
|
||||
... def eat(self, food):
|
||||
... return 'yum'
|
||||
...
|
||||
>>> GoodEater3().eat('steak')
|
||||
'yum'
|
||||
>>> GoodEater3().batch_eat(['steak', 'peas'])
|
||||
['yum', 'yum']
|
||||
|
||||
>>> class GoodEater4(AbstractEater):
|
||||
... def batch_eat(self, foods):
|
||||
... return ['yum' for food in foods]
|
||||
>>> GoodEater4().eat('steak')
|
||||
'yum'
|
||||
>>> GoodEater4().batch_eat(['steak', 'peas'])
|
||||
['yum', 'yum']
|
||||
|
||||
>>> class BadEater2(AbstractEater):
|
||||
... pass
|
||||
>>> BadEater2().eat('steak')
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
>>> BadEater2().batch_eat(['steak', 'peas'])
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
NotImplementedError
|
||||
|
||||
Here are some extra tests:
|
||||
|
||||
>>> class A(object):
|
||||
... def f(x): pass
|
||||
>>> class B(A):
|
||||
... def f(x): pass
|
||||
>>> class C(A): pass
|
||||
>>> class D(B): pass
|
||||
|
||||
>>> overridden(A().f)
|
||||
False
|
||||
>>> overridden(B().f)
|
||||
True
|
||||
>>> overridden(C().f)
|
||||
False
|
||||
>>> overridden(D().f)
|
||||
True
|
||||
|
||||
It works for classes declared without an explicit base (Python 2 "classic" classes), too:
|
||||
|
||||
>>> class A:
|
||||
... def f(x): pass
|
||||
>>> class B(A):
|
||||
... def f(x): pass
|
||||
>>> class C(A): pass
|
||||
>>> class D(B): pass
|
||||
>>> overridden(A().f)
|
||||
False
|
||||
>>> overridden(B().f)
|
||||
True
|
||||
>>> overridden(C().f)
|
||||
False
|
||||
>>> overridden(D().f)
|
||||
True
|
||||
|
||||
|
||||
read_str()
|
||||
~~~~~~~~~~~~
|
||||
>>> from nltk.internals import read_str
|
||||
|
||||
Test valid scenarios
|
||||
|
||||
>>> read_str("'valid string'", 0)
|
||||
('valid string', 14)
|
||||
|
||||
Now test invalid scenarios
|
||||
|
||||
>>> read_str("should error", 0)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
nltk.internals.ReadError: Expected open quote at 0
|
||||
>>> read_str("'should error", 0)
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
nltk.internals.ReadError: Expected close quote at 1
|
||||
@@ -0,0 +1,48 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
============================
|
||||
Japanese Language Processing
|
||||
============================
|
||||
|
||||
>>> from nltk import *
|
||||
|
||||
-------------
|
||||
Corpus Access
|
||||
-------------
|
||||
|
||||
KNB Corpus
|
||||
----------
|
||||
|
||||
>>> from nltk.corpus import knbc
|
||||
|
||||
Access the words: this should produce a list of strings:
|
||||
|
||||
>>> type(knbc.words()[0]) is not bytes
|
||||
True
|
||||
|
||||
Access the sentences: this should produce a list of lists of strings:
|
||||
|
||||
>>> type(knbc.sents()[0][0]) is not bytes
|
||||
True
|
||||
|
||||
Access the tagged words: this should produce a list of word, tag pairs:
|
||||
|
||||
>>> type(knbc.tagged_words()[0])
|
||||
<... 'tuple'>
|
||||
|
||||
Access the tagged sentences: this should produce a list of lists of word, tag pairs:
|
||||
|
||||
>>> type(knbc.tagged_sents()[0][0])
|
||||
<... 'tuple'>
|
||||
|
||||
|
||||
JEITA Corpus
|
||||
------------
|
||||
|
||||
>>> from nltk.corpus import jeita
|
||||
|
||||
Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string:
|
||||
|
||||
>>> type(jeita.tagged_words()[0][1]) is not bytes
|
||||
True
|
||||
135
Backend/venv/lib/python3.12/site-packages/nltk/test/lm.doctest
Normal file
135
Backend/venv/lib/python3.12/site-packages/nltk/test/lm.doctest
Normal file
@@ -0,0 +1,135 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
.. -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
Regression Tests
|
||||
================
|
||||
|
||||
|
||||
Issue 167
|
||||
---------
|
||||
https://github.com/nltk/nltk/issues/167
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> from nltk.lm.preprocessing import padded_everygram_pipeline
|
||||
>>> ngram_order = 3
|
||||
>>> train_data, vocab_data = padded_everygram_pipeline(
|
||||
... ngram_order,
|
||||
... brown.sents(categories="news")
|
||||
... )
|
||||
|
||||
>>> from nltk.lm import WittenBellInterpolated
|
||||
>>> lm = WittenBellInterpolated(ngram_order)
|
||||
>>> lm.fit(train_data, vocab_data)
|
||||
|
||||
|
||||
|
||||
|
||||
A sentence containing an unseen word should result in infinite entropy because
|
||||
Witten-Bell is based ultimately on MLE, which cannot handle unseen ngrams.
|
||||
Crucially, it shouldn't raise any exceptions for unseen words.
|
||||
|
||||
>>> from nltk.util import ngrams
|
||||
>>> sent = ngrams("This is a sentence with the word aaddvark".split(), 3)
|
||||
>>> lm.entropy(sent)
|
||||
inf
|
||||
|
||||
If we remove all unseen ngrams from the sentence, we'll get a non-infinite value
|
||||
for the entropy.
|
||||
|
||||
>>> sent = ngrams("This is a sentence".split(), 3)
|
||||
>>> round(lm.entropy(sent), 14)
|
||||
10.23701322869105
|
||||
|
||||
|
||||
Issue 367
|
||||
---------
|
||||
https://github.com/nltk/nltk/issues/367
|
||||
|
||||
Reproducing Dan Blanchard's example:
|
||||
https://github.com/nltk/nltk/issues/367#issuecomment-14646110
|
||||
|
||||
>>> from nltk.lm import Lidstone, Vocabulary
|
||||
>>> word_seq = list('aaaababaaccbacb')
|
||||
>>> ngram_order = 2
|
||||
>>> from nltk.util import everygrams
|
||||
>>> train_data = [everygrams(word_seq, max_len=ngram_order)]
|
||||
>>> V = Vocabulary(['a', 'b', 'c', ''])
|
||||
>>> lm = Lidstone(0.2, ngram_order, vocabulary=V)
|
||||
>>> lm.fit(train_data)
|
||||
|
||||
For doctest to work we have to sort the vocabulary keys.
|
||||
|
||||
>>> V_keys = sorted(V)
|
||||
>>> round(sum(lm.score(w, ("b",)) for w in V_keys), 6)
|
||||
1.0
|
||||
>>> round(sum(lm.score(w, ("a",)) for w in V_keys), 6)
|
||||
1.0
|
||||
|
||||
>>> [lm.score(w, ("b",)) for w in V_keys]
|
||||
[0.05, 0.05, 0.8, 0.05, 0.05]
|
||||
>>> [round(lm.score(w, ("a",)), 4) for w in V_keys]
|
||||
[0.0222, 0.0222, 0.4667, 0.2444, 0.2444]
|
||||
|
||||
|
||||
Here we reproduce @afourney's comment:
|
||||
https://github.com/nltk/nltk/issues/367#issuecomment-15686289
|
||||
|
||||
>>> sent = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
|
||||
>>> ngram_order = 3
|
||||
>>> from nltk.lm.preprocessing import padded_everygram_pipeline
|
||||
>>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, [sent])
|
||||
>>> from nltk.lm import Lidstone
|
||||
>>> lm = Lidstone(0.2, ngram_order)
|
||||
>>> lm.fit(train_data, vocab_data)
|
||||
|
||||
The vocabulary includes the "UNK" symbol as well as two padding symbols.
|
||||
|
||||
>>> len(lm.vocab)
|
||||
6
|
||||
>>> word = "foo"
|
||||
>>> context = ("bar", "baz")
|
||||
|
||||
The raw counts.
|
||||
|
||||
>>> lm.context_counts(context)[word]
|
||||
0
|
||||
>>> lm.context_counts(context).N()
|
||||
1
|
||||
|
||||
Counts with Lidstone smoothing.
|
||||
|
||||
>>> lm.context_counts(context)[word] + lm.gamma
|
||||
0.2
|
||||
>>> lm.context_counts(context).N() + len(lm.vocab) * lm.gamma
|
||||
2.2
|
||||
|
||||
Without any backoff, just using Lidstone smoothing, P("foo" | "bar", "baz") should be:
|
||||
0.2 / 2.2 ~= 0.090909
|
||||
|
||||
>>> round(lm.score(word, context), 6)
|
||||
0.090909
|
||||
|
||||
|
||||
Issue 380
|
||||
---------
|
||||
https://github.com/nltk/nltk/issues/380
|
||||
|
||||
Reproducing setup akin to this comment:
|
||||
https://github.com/nltk/nltk/issues/380#issue-12879030
|
||||
|
||||
For speed take only the first 100 sentences of reuters. Shouldn't affect the test.
|
||||
|
||||
>>> from nltk.corpus import reuters
|
||||
>>> sents = reuters.sents()[:100]
|
||||
>>> ngram_order = 3
|
||||
>>> from nltk.lm.preprocessing import padded_everygram_pipeline
|
||||
>>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, sents)
|
||||
|
||||
>>> from nltk.lm import Lidstone
|
||||
>>> lm = Lidstone(0.2, ngram_order)
|
||||
>>> lm.fit(train_data, vocab_data)
|
||||
>>> lm.score("said", ("",)) < 1
|
||||
True
|
||||
1096
Backend/venv/lib/python3.12/site-packages/nltk/test/logic.doctest
Normal file
1096
Backend/venv/lib/python3.12/site-packages/nltk/test/logic.doctest
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,54 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
.. -*- coding: utf-8 -*-
|
||||
|
||||
=============
|
||||
METEOR tests
|
||||
=============
|
||||
|
||||
No Alignment test
|
||||
------------------
|
||||
|
||||
>>> from nltk.translate import meteor
|
||||
>>> from nltk import word_tokenize
|
||||
|
||||
If the candidate has no alignment to any of the references, the METEOR score is 0.
|
||||
|
||||
>>> round(meteor(
|
||||
... [word_tokenize('The candidate has no alignment to any of the references')],
|
||||
... word_tokenize('John loves Mary')
|
||||
... ), 4)
|
||||
0.0
|
||||
|
||||
Tests based on wikipedia examples
|
||||
---------------------------------
|
||||
|
||||
Testing on `wikipedia examples <https://en.wikipedia.org/wiki/METEOR#Examples>`_
|
||||
|
||||
>>> same_res = round(meteor(
|
||||
... [word_tokenize('The cat sat on the mat')],
|
||||
... word_tokenize('The cat sat on the mat')
|
||||
... ), 4)
|
||||
>>> abs(same_res - 0.9977) < 1e-2
|
||||
True
|
||||
|
||||
>>> meteor(
|
||||
... [word_tokenize('The cat sat on the mat')],
|
||||
... word_tokenize('on the mat sat the cat')
|
||||
... )
|
||||
0.5
|
||||
|
||||
>>> round(meteor(
|
||||
... [word_tokenize('The cat sat on the mat')],
|
||||
... word_tokenize('The cat was sat on the mat')
|
||||
... ), 4)
|
||||
0.9654
|
||||
|
||||
Test corresponding to issue #2751, where METEOR score > 1
|
||||
|
||||
>>> round(meteor(
|
||||
... [word_tokenize('create or update a vm set')],
|
||||
... word_tokenize('creates or updates a virtual machine scale set')
|
||||
... ), 4)
|
||||
0.7806
|
||||
@@ -0,0 +1,321 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=======
|
||||
Metrics
|
||||
=======
|
||||
|
||||
-----
|
||||
Setup
|
||||
-----
|
||||
|
||||
>>> import pytest
|
||||
>>> _ = pytest.importorskip("numpy")
|
||||
|
||||
|
||||
The `nltk.metrics` package provides a variety of *evaluation measures*
|
||||
which can be used for a wide variety of NLP tasks.
|
||||
|
||||
>>> from nltk.metrics import *
|
||||
|
||||
------------------
|
||||
Standard IR Scores
|
||||
------------------
|
||||
|
||||
We can use standard scores from information retrieval to test the
|
||||
performance of taggers, chunkers, etc.
|
||||
|
||||
>>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
|
||||
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
|
||||
>>> print(accuracy(reference, test))
|
||||
0.8
|
||||
|
||||
|
||||
The following measures apply to sets:
|
||||
|
||||
>>> reference_set = set(reference)
|
||||
>>> test_set = set(test)
|
||||
>>> precision(reference_set, test_set)
|
||||
1.0
|
||||
>>> print(recall(reference_set, test_set))
|
||||
0.8
|
||||
>>> print(f_measure(reference_set, test_set))
|
||||
0.88888888888...
|
||||
|
||||
Measuring the likelihood of the data, given probability distributions:
|
||||
|
||||
>>> from nltk import FreqDist, MLEProbDist
|
||||
>>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf"))
|
||||
>>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss"))
|
||||
>>> print(log_likelihood(['a', 'd'], [pdist1, pdist2]))
|
||||
-2.7075187496...
|
||||
|
||||
|
||||
----------------
|
||||
Distance Metrics
|
||||
----------------
|
||||
|
||||
String edit distance (Levenshtein):
|
||||
|
||||
>>> edit_distance("rain", "shine")
|
||||
3
|
||||
>>> edit_distance_align("shine", "shine")
|
||||
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)]
|
||||
>>> edit_distance_align("rain", "brainy")
|
||||
[(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)]
|
||||
>>> edit_distance_align("", "brainy")
|
||||
[(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)]
|
||||
>>> edit_distance_align("", "")
|
||||
[(0, 0)]
|
||||
|
||||
Other distance measures:
|
||||
|
||||
>>> s1 = set([1,2,3,4])
|
||||
>>> s2 = set([3,4,5])
|
||||
>>> binary_distance(s1, s2)
|
||||
1.0
|
||||
>>> print(jaccard_distance(s1, s2))
|
||||
0.6
|
||||
>>> print(masi_distance(s1, s2))
|
||||
0.868
|
||||
|
||||
----------------------
|
||||
Miscellaneous Measures
|
||||
----------------------
|
||||
|
||||
Rank Correlation works with two dictionaries mapping keys to ranks.
|
||||
The dictionaries should have the same set of keys.
|
||||
|
||||
>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3})
|
||||
0.5
|
||||
|
||||
Windowdiff uses a sliding window in comparing two segmentations of the same input (e.g. tokenizations, chunkings).
|
||||
Segmentations are represented using strings of zeros and ones.
|
||||
|
||||
>>> s1 = "000100000010"
|
||||
>>> s2 = "000010000100"
|
||||
>>> s3 = "100000010000"
|
||||
>>> s4 = "000000000000"
|
||||
>>> s5 = "111111111111"
|
||||
>>> windowdiff(s1, s1, 3)
|
||||
0.0
|
||||
>>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3
|
||||
True
|
||||
>>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8
|
||||
True
|
||||
>>> windowdiff(s1, s4, 3)
|
||||
0.5
|
||||
>>> windowdiff(s1, s5, 3)
|
||||
1.0
|
||||
|
||||
----------------
|
||||
Confusion Matrix
|
||||
----------------
|
||||
|
||||
>>> reference = 'This is the reference data. Testing 123. aoaeoeoe'
|
||||
>>> test = 'Thos iz_the rifirenci data. Testeng 123. aoaeoeoe'
|
||||
>>> print(ConfusionMatrix(reference, test))
|
||||
| . 1 2 3 T _ a c d e f g h i n o r s t z |
|
||||
--+-------------------------------------------+
|
||||
|<8>. . . . . 1 . . . . . . . . . . . . . . |
|
||||
. | .<2>. . . . . . . . . . . . . . . . . . . |
|
||||
1 | . .<1>. . . . . . . . . . . . . . . . . . |
|
||||
2 | . . .<1>. . . . . . . . . . . . . . . . . |
|
||||
3 | . . . .<1>. . . . . . . . . . . . . . . . |
|
||||
T | . . . . .<2>. . . . . . . . . . . . . . . |
|
||||
_ | . . . . . .<.>. . . . . . . . . . . . . . |
|
||||
a | . . . . . . .<4>. . . . . . . . . . . . . |
|
||||
c | . . . . . . . .<1>. . . . . . . . . . . . |
|
||||
d | . . . . . . . . .<1>. . . . . . . . . . . |
|
||||
e | . . . . . . . . . .<6>. . . 3 . . . . . . |
|
||||
f | . . . . . . . . . . .<1>. . . . . . . . . |
|
||||
g | . . . . . . . . . . . .<1>. . . . . . . . |
|
||||
h | . . . . . . . . . . . . .<2>. . . . . . . |
|
||||
i | . . . . . . . . . . 1 . . .<1>. 1 . . . . |
|
||||
n | . . . . . . . . . . . . . . .<2>. . . . . |
|
||||
o | . . . . . . . . . . . . . . . .<3>. . . . |
|
||||
r | . . . . . . . . . . . . . . . . .<2>. . . |
|
||||
s | . . . . . . . . . . . . . . . . . .<2>. 1 |
|
||||
t | . . . . . . . . . . . . . . . . . . .<3>. |
|
||||
z | . . . . . . . . . . . . . . . . . . . .<.>|
|
||||
--+-------------------------------------------+
|
||||
(row = reference; col = test)
|
||||
<BLANKLINE>
|
||||
|
||||
>>> cm = ConfusionMatrix(reference, test)
|
||||
>>> print(cm.pretty_format(sort_by_count=True))
|
||||
| e a i o s t . T h n r 1 2 3 c d f g _ z |
|
||||
--+-------------------------------------------+
|
||||
|<8>. . . . . . . . . . . . . . . . . . 1 . |
|
||||
e | .<6>. 3 . . . . . . . . . . . . . . . . . |
|
||||
a | . .<4>. . . . . . . . . . . . . . . . . . |
|
||||
i | . 1 .<1>1 . . . . . . . . . . . . . . . . |
|
||||
o | . . . .<3>. . . . . . . . . . . . . . . . |
|
||||
s | . . . . .<2>. . . . . . . . . . . . . . 1 |
|
||||
t | . . . . . .<3>. . . . . . . . . . . . . . |
|
||||
. | . . . . . . .<2>. . . . . . . . . . . . . |
|
||||
T | . . . . . . . .<2>. . . . . . . . . . . . |
|
||||
h | . . . . . . . . .<2>. . . . . . . . . . . |
|
||||
n | . . . . . . . . . .<2>. . . . . . . . . . |
|
||||
r | . . . . . . . . . . .<2>. . . . . . . . . |
|
||||
1 | . . . . . . . . . . . .<1>. . . . . . . . |
|
||||
2 | . . . . . . . . . . . . .<1>. . . . . . . |
|
||||
3 | . . . . . . . . . . . . . .<1>. . . . . . |
|
||||
c | . . . . . . . . . . . . . . .<1>. . . . . |
|
||||
d | . . . . . . . . . . . . . . . .<1>. . . . |
|
||||
f | . . . . . . . . . . . . . . . . .<1>. . . |
|
||||
g | . . . . . . . . . . . . . . . . . .<1>. . |
|
||||
_ | . . . . . . . . . . . . . . . . . . .<.>. |
|
||||
z | . . . . . . . . . . . . . . . . . . . .<.>|
|
||||
--+-------------------------------------------+
|
||||
(row = reference; col = test)
|
||||
<BLANKLINE>
|
||||
|
||||
>>> print(cm.pretty_format(sort_by_count=True, truncate=10))
|
||||
| e a i o s t . T h |
|
||||
--+---------------------+
|
||||
|<8>. . . . . . . . . |
|
||||
e | .<6>. 3 . . . . . . |
|
||||
a | . .<4>. . . . . . . |
|
||||
i | . 1 .<1>1 . . . . . |
|
||||
o | . . . .<3>. . . . . |
|
||||
s | . . . . .<2>. . . . |
|
||||
t | . . . . . .<3>. . . |
|
||||
. | . . . . . . .<2>. . |
|
||||
T | . . . . . . . .<2>. |
|
||||
h | . . . . . . . . .<2>|
|
||||
--+---------------------+
|
||||
(row = reference; col = test)
|
||||
<BLANKLINE>
|
||||
|
||||
>>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False))
|
||||
| 1 |
|
||||
| 1 2 3 4 5 6 7 8 9 0 |
|
||||
---+---------------------+
|
||||
1 |<8>. . . . . . . . . |
|
||||
2 | .<6>. 3 . . . . . . |
|
||||
3 | . .<4>. . . . . . . |
|
||||
4 | . 1 .<1>1 . . . . . |
|
||||
5 | . . . .<3>. . . . . |
|
||||
6 | . . . . .<2>. . . . |
|
||||
7 | . . . . . .<3>. . . |
|
||||
8 | . . . . . . .<2>. . |
|
||||
9 | . . . . . . . .<2>. |
|
||||
10 | . . . . . . . . .<2>|
|
||||
---+---------------------+
|
||||
(row = reference; col = test)
|
||||
Value key:
|
||||
1:
|
||||
2: e
|
||||
3: a
|
||||
4: i
|
||||
5: o
|
||||
6: s
|
||||
7: t
|
||||
8: .
|
||||
9: T
|
||||
10: h
|
||||
<BLANKLINE>
|
||||
|
||||
For "e", the number of true positives should be 6, while the number of false negatives is 3.
|
||||
So, the recall ought to be 6 / (6 + 3):
|
||||
|
||||
>>> cm.recall("e") # doctest: +ELLIPSIS
|
||||
0.666666...
|
||||
|
||||
For "e", there is just 1 false positive, so the precision should be 6 / (6 + 1):
|
||||
|
||||
>>> cm.precision("e") # doctest: +ELLIPSIS
|
||||
0.857142...
|
||||
|
||||
The f-measure with default value of ``alpha = 0.5`` should then be:
|
||||
|
||||
* *1/(alpha/p + (1-alpha)/r) =*
|
||||
* *1/(0.5/p + 0.5/r) =*
|
||||
* *2pr / (p + r) =*
|
||||
* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =*
|
||||
* *0.749999...*
|
||||
|
||||
>>> cm.f_measure("e") # doctest: +ELLIPSIS
|
||||
0.749999...
|
||||
|
||||
--------------------
|
||||
Association measures
|
||||
--------------------
|
||||
|
||||
These measures are useful to determine whether the co-occurrence of two random
|
||||
events is meaningful. They are used, for instance, to distinguish collocations
|
||||
from other pairs of adjacent words.
|
||||
|
||||
We bring some examples of bigram association calculations from Manning and
|
||||
Schutze's SNLP, 2nd Ed. chapter 5.
|
||||
|
||||
>>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668
|
||||
>>> bam = BigramAssocMeasures
|
||||
>>> bam.raw_freq(20, (42, 20), N) == 20. / N
|
||||
True
|
||||
>>> bam.student_t(n_new_companies, (n_new, n_companies), N)
|
||||
0.999...
|
||||
>>> bam.chi_sq(n_new_companies, (n_new, n_companies), N)
|
||||
1.54...
|
||||
>>> bam.likelihood_ratio(150, (12593, 932), N)
|
||||
1291...
|
||||
|
||||
For other associations, we ensure the ordering of the measures:
|
||||
|
||||
>>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N)
|
||||
True
|
||||
>>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N)
|
||||
True
|
||||
>>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N)
|
||||
True
|
||||
>>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N)
|
||||
True
|
||||
>>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N)
|
||||
True
|
||||
>>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N)
|
||||
True
|
||||
>>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP
|
||||
False
|
||||
|
||||
For trigrams, we have to provide more count information:
|
||||
|
||||
>>> n_w1_w2_w3 = 20
|
||||
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
|
||||
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
|
||||
>>> n_w1, n_w2, n_w3 = 100, 200, 300
|
||||
>>> uni_counts = (n_w1, n_w2, n_w3)
|
||||
>>> N = 14307668
|
||||
>>> tam = TrigramAssocMeasures
|
||||
>>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N
|
||||
True
|
||||
>>> uni_counts2 = (n_w1, n_w2, 100)
|
||||
>>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
>>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
>>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
>>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
>>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
>>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
>>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N)
|
||||
True
|
||||
|
||||
|
||||
For fourgrams, we have to provide more count information:
|
||||
|
||||
>>> n_w1_w2_w3_w4 = 5
|
||||
>>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40
|
||||
>>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10
|
||||
>>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3)
|
||||
>>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4)
|
||||
>>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400
|
||||
>>> uni_counts = (n_w1, n_w2, n_w3, n_w4)
|
||||
>>> N = 14307668
|
||||
>>> qam = QuadgramAssocMeasures
|
||||
>>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N
|
||||
True
|
||||
118
Backend/venv/lib/python3.12/site-packages/nltk/test/misc.doctest
Normal file
118
Backend/venv/lib/python3.12/site-packages/nltk/test/misc.doctest
Normal file
@@ -0,0 +1,118 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
Unit tests for the miscellaneous sort functions.
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
>>> from copy import deepcopy
|
||||
>>> from nltk.misc.sort import *
|
||||
|
||||
A (very) small list of unsorted integers.
|
||||
|
||||
>>> test_data = [12, 67, 7, 28, 92, 56, 53, 720, 91, 57, 20, 20]
|
||||
|
||||
Test each sorting method - each method returns the number of operations
|
||||
required to sort the data, and sorts in-place (destructively - hence the need
|
||||
for multiple copies).
|
||||
|
||||
>>> sorted_data = deepcopy(test_data)
|
||||
>>> selection(sorted_data)
|
||||
66
|
||||
|
||||
>>> sorted_data
|
||||
[7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
|
||||
|
||||
>>> sorted_data = deepcopy(test_data)
|
||||
>>> bubble(sorted_data)
|
||||
30
|
||||
|
||||
>>> sorted_data
|
||||
[7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
|
||||
|
||||
>>> sorted_data = deepcopy(test_data)
|
||||
>>> merge(sorted_data)
|
||||
30
|
||||
|
||||
>>> sorted_data
|
||||
[7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
|
||||
|
||||
>>> sorted_data = deepcopy(test_data)
|
||||
>>> quick(sorted_data)
|
||||
13
|
||||
|
||||
>>> sorted_data
|
||||
[7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720]
|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
Unit tests for Wordfinder class
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
>>> import random
|
||||
|
||||
>>> # The following is not enough for reproducibility under Python 2/3
|
||||
>>> # (see https://bugs.python.org/issue9025) so this test is skipped.
|
||||
>>> random.seed(12345)
|
||||
|
||||
>>> from nltk.misc import wordfinder
|
||||
>>> wordfinder.word_finder() # doctest: +SKIP
|
||||
Word Finder
|
||||
<BLANKLINE>
|
||||
J V L A I R O T A T I S I V O D E R E T
|
||||
H U U B E A R O E P O C S O R E T N E P
|
||||
A D A U Z E E S R A P P A L L M E N T R
|
||||
C X A D Q S Z T P E O R S N G P J A D E
|
||||
I G Y K K T I A A R G F I D T E L C N S
|
||||
R E C N B H T R L T N N B W N T A O A I
|
||||
A Y I L O E I A M E I A A Y U R P L L D
|
||||
G L T V S T S F E A D I P H D O O H N I
|
||||
R L S E C I N I L R N N M E C G R U E A
|
||||
A A Y G I C E N L L E O I G Q R T A E L
|
||||
M R C E T I S T A E T L L E U A E N R L
|
||||
O U O T A S E E C S O O N H Y P A T G Y
|
||||
E M H O M M D R E S F P U L T H C F N V
|
||||
L A C A I M A M A N L B R U T E D O M I
|
||||
O R I L N E E E E E U A R S C R Y L I P
|
||||
H T R K E S N N M S I L A S R E V I N U
|
||||
T X T A A O U T K S E T A R R E S I B J
|
||||
A E D L E L J I F O O R P E L K N I R W
|
||||
K H A I D E Q O P R I C K T I M B E R P
|
||||
Z K D O O H G N I H T U R V E Y D R O P
|
||||
<BLANKLINE>
|
||||
1: INTERCHANGER
|
||||
2: TEARLESSNESS
|
||||
3: UNIVERSALISM
|
||||
4: DESENSITIZER
|
||||
5: INTERMENTION
|
||||
6: TRICHOCYSTIC
|
||||
7: EXTRAMURALLY
|
||||
8: VEGETOALKALI
|
||||
9: PALMELLACEAE
|
||||
10: AESTHETICISM
|
||||
11: PETROGRAPHER
|
||||
12: VISITATORIAL
|
||||
13: OLEOMARGARIC
|
||||
14: WRINKLEPROOF
|
||||
15: PRICKTIMBER
|
||||
16: PRESIDIALLY
|
||||
17: SCITAMINEAE
|
||||
18: ENTEROSCOPE
|
||||
19: APPALLMENT
|
||||
20: TURVEYDROP
|
||||
21: THINGHOOD
|
||||
22: BISERRATE
|
||||
23: GREENLAND
|
||||
24: BRUTEDOM
|
||||
25: POLONIAN
|
||||
26: ACOLHUAN
|
||||
27: LAPORTEA
|
||||
28: TENDING
|
||||
29: TEREDO
|
||||
30: MESOLE
|
||||
31: UNLIMP
|
||||
32: OSTARA
|
||||
33: PILY
|
||||
34: DUNT
|
||||
35: ONYX
|
||||
36: KATH
|
||||
37: JUNE
|
||||
@@ -0,0 +1,293 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
======================
|
||||
Nonmonotonic Reasoning
|
||||
======================
|
||||
|
||||
>>> from nltk.test.setup_fixt import check_binary
|
||||
>>> check_binary('mace4')
|
||||
|
||||
>>> from nltk import *
|
||||
>>> from nltk.inference.nonmonotonic import *
|
||||
>>> from nltk.sem import logic
|
||||
>>> logic._counter._value = 0
|
||||
>>> read_expr = logic.Expression.fromstring
|
||||
|
||||
------------------------
|
||||
Closed Domain Assumption
|
||||
------------------------
|
||||
|
||||
The only entities in the domain are those found in the assumptions or goal.
|
||||
If the domain only contains "A" and "B", then the expression "exists x.P(x)" can
|
||||
be replaced with "P(A) | P(B)" and an expression "all x.P(x)" can be replaced
|
||||
with "P(A) & P(B)".
|
||||
|
||||
>>> p1 = read_expr(r'all x.(man(x) -> mortal(x))')
|
||||
>>> p2 = read_expr(r'man(Socrates)')
|
||||
>>> c = read_expr(r'mortal(Socrates)')
|
||||
>>> prover = Prover9Command(c, [p1,p2])
|
||||
>>> prover.prove()
|
||||
True
|
||||
>>> cdp = ClosedDomainProver(prover)
|
||||
>>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
|
||||
(man(Socrates) -> mortal(Socrates))
|
||||
man(Socrates)
|
||||
>>> cdp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'exists x.walk(x)')
|
||||
>>> p2 = read_expr(r'man(Socrates)')
|
||||
>>> c = read_expr(r'walk(Socrates)')
|
||||
>>> prover = Prover9Command(c, [p1,p2])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> cdp = ClosedDomainProver(prover)
|
||||
>>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
|
||||
walk(Socrates)
|
||||
man(Socrates)
|
||||
>>> cdp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'exists x.walk(x)')
|
||||
>>> p2 = read_expr(r'man(Socrates)')
|
||||
>>> p3 = read_expr(r'-walk(Bill)')
|
||||
>>> c = read_expr(r'walk(Socrates)')
|
||||
>>> prover = Prover9Command(c, [p1,p2,p3])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> cdp = ClosedDomainProver(prover)
|
||||
>>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
|
||||
(walk(Socrates) | walk(Bill))
|
||||
man(Socrates)
|
||||
-walk(Bill)
|
||||
>>> cdp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'walk(Socrates)')
|
||||
>>> p2 = read_expr(r'walk(Bill)')
|
||||
>>> c = read_expr(r'all x.walk(x)')
|
||||
>>> prover = Prover9Command(c, [p1,p2])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> cdp = ClosedDomainProver(prover)
|
||||
>>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
|
||||
walk(Socrates)
|
||||
walk(Bill)
|
||||
>>> print(cdp.goal()) # doctest: +SKIP
|
||||
(walk(Socrates) & walk(Bill))
|
||||
>>> cdp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'girl(mary)')
|
||||
>>> p2 = read_expr(r'dog(rover)')
|
||||
>>> p3 = read_expr(r'all x.(girl(x) -> -dog(x))')
|
||||
>>> p4 = read_expr(r'all x.(dog(x) -> -girl(x))')
|
||||
>>> p5 = read_expr(r'chase(mary, rover)')
|
||||
>>> c = read_expr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))')
|
||||
>>> prover = Prover9Command(c, [p1,p2,p3,p4,p5])
|
||||
>>> print(prover.prove())
|
||||
False
|
||||
>>> cdp = ClosedDomainProver(prover)
|
||||
>>> for a in cdp.assumptions(): print(a) # doctest: +SKIP
|
||||
girl(mary)
|
||||
dog(rover)
|
||||
((girl(rover) -> -dog(rover)) & (girl(mary) -> -dog(mary)))
|
||||
((dog(rover) -> -girl(rover)) & (dog(mary) -> -girl(mary)))
|
||||
chase(mary,rover)
|
||||
>>> print(cdp.goal()) # doctest: +SKIP
|
||||
((dog(rover) & (girl(rover) -> chase(rover,rover)) & (girl(mary) -> chase(mary,rover))) | (dog(mary) & (girl(rover) -> chase(rover,mary)) & (girl(mary) -> chase(mary,mary))))
|
||||
>>> print(cdp.prove())
|
||||
True
|
||||
|
||||
-----------------------
|
||||
Unique Names Assumption
|
||||
-----------------------
|
||||
|
||||
No two entities in the domain represent the same entity unless it can be
|
||||
explicitly proven that they do. Therefore, if the domain contains "A" and "B",
|
||||
then add the assumption "-(A = B)" if it is not the case that
|
||||
"<assumptions> \|- (A = B)".
|
||||
|
||||
>>> p1 = read_expr(r'man(Socrates)')
|
||||
>>> p2 = read_expr(r'man(Bill)')
|
||||
>>> c = read_expr(r'exists x.exists y.-(x = y)')
|
||||
>>> prover = Prover9Command(c, [p1,p2])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> unp = UniqueNamesProver(prover)
|
||||
>>> for a in unp.assumptions(): print(a) # doctest: +SKIP
|
||||
man(Socrates)
|
||||
man(Bill)
|
||||
-(Socrates = Bill)
|
||||
>>> unp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'all x.(walk(x) -> (x = Socrates))')
|
||||
>>> p2 = read_expr(r'Bill = William')
|
||||
>>> p3 = read_expr(r'Bill = Billy')
|
||||
>>> c = read_expr(r'-walk(William)')
|
||||
>>> prover = Prover9Command(c, [p1,p2,p3])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> unp = UniqueNamesProver(prover)
|
||||
>>> for a in unp.assumptions(): print(a) # doctest: +SKIP
|
||||
all x.(walk(x) -> (x = Socrates))
|
||||
(Bill = William)
|
||||
(Bill = Billy)
|
||||
-(William = Socrates)
|
||||
-(Billy = Socrates)
|
||||
-(Socrates = Bill)
|
||||
>>> unp.prove()
|
||||
True
|
||||
|
||||
-----------------------
|
||||
Closed World Assumption
|
||||
-----------------------
|
||||
|
||||
The only entities that have certain properties are those that it is stated
|
||||
have the properties. We accomplish this assumption by "completing" predicates.
|
||||
|
||||
If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion
|
||||
of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then
|
||||
"all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the
|
||||
assumptions don't contain anything that is "P", then "all x.-P(x)" is the
|
||||
completion of "P".
|
||||
|
||||
>>> p1 = read_expr(r'walk(Socrates)')
|
||||
>>> p2 = read_expr(r'-(Socrates = Bill)')
|
||||
>>> c = read_expr(r'-walk(Bill)')
|
||||
>>> prover = Prover9Command(c, [p1,p2])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> cwp = ClosedWorldProver(prover)
|
||||
>>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
|
||||
walk(Socrates)
|
||||
-(Socrates = Bill)
|
||||
all z1.(walk(z1) -> (z1 = Socrates))
|
||||
>>> cwp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'see(Socrates, John)')
|
||||
>>> p2 = read_expr(r'see(John, Mary)')
|
||||
>>> p3 = read_expr(r'-(Socrates = John)')
|
||||
>>> p4 = read_expr(r'-(John = Mary)')
|
||||
>>> c = read_expr(r'-see(Socrates, Mary)')
|
||||
>>> prover = Prover9Command(c, [p1,p2,p3,p4])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> cwp = ClosedWorldProver(prover)
|
||||
>>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
|
||||
see(Socrates,John)
|
||||
see(John,Mary)
|
||||
-(Socrates = John)
|
||||
-(John = Mary)
|
||||
all z3 z4.(see(z3,z4) -> (((z3 = Socrates) & (z4 = John)) | ((z3 = John) & (z4 = Mary))))
|
||||
>>> cwp.prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr(r'all x.(ostrich(x) -> bird(x))')
|
||||
>>> p2 = read_expr(r'bird(Tweety)')
|
||||
>>> p3 = read_expr(r'-ostrich(Sam)')
|
||||
>>> p4 = read_expr(r'Sam != Tweety')
|
||||
>>> c = read_expr(r'-bird(Sam)')
|
||||
>>> prover = Prover9Command(c, [p1,p2,p3,p4])
|
||||
>>> prover.prove()
|
||||
False
|
||||
>>> cwp = ClosedWorldProver(prover)
|
||||
>>> for a in cwp.assumptions(): print(a) # doctest: +SKIP
|
||||
all x.(ostrich(x) -> bird(x))
|
||||
bird(Tweety)
|
||||
-ostrich(Sam)
|
||||
-(Sam = Tweety)
|
||||
all z7.-ostrich(z7)
|
||||
all z8.(bird(z8) -> ((z8 = Tweety) | ostrich(z8)))
|
||||
>>> print(cwp.prove())
|
||||
True
|
||||
|
||||
-----------------------
|
||||
Multi-Decorator Example
|
||||
-----------------------
|
||||
|
||||
Decorators can be nested to utilize multiple assumptions.
|
||||
|
||||
>>> p1 = read_expr(r'see(Socrates, John)')
|
||||
>>> p2 = read_expr(r'see(John, Mary)')
|
||||
>>> c = read_expr(r'-see(Socrates, Mary)')
|
||||
>>> prover = Prover9Command(c, [p1,p2])
|
||||
>>> print(prover.prove())
|
||||
False
|
||||
>>> cmd = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover)))
|
||||
>>> print(cmd.prove())
|
||||
True
|
||||
|
||||
-----------------
|
||||
Default Reasoning
|
||||
-----------------
|
||||
>>> logic._counter._value = 0
|
||||
>>> premises = []
|
||||
|
||||
define the taxonomy
|
||||
|
||||
>>> premises.append(read_expr(r'all x.(elephant(x) -> animal(x))'))
|
||||
>>> premises.append(read_expr(r'all x.(bird(x) -> animal(x))'))
|
||||
>>> premises.append(read_expr(r'all x.(dove(x) -> bird(x))'))
|
||||
>>> premises.append(read_expr(r'all x.(ostrich(x) -> bird(x))'))
|
||||
>>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> ostrich(x))'))
|
||||
|
||||
default the properties using abnormalities
|
||||
|
||||
>>> premises.append(read_expr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly
|
||||
>>> premises.append(read_expr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly
|
||||
>>> premises.append(read_expr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly
|
||||
|
||||
specify abnormal entities
|
||||
|
||||
>>> premises.append(read_expr(r'all x.(bird(x) -> Ab1(x))')) #flight
|
||||
>>> premises.append(read_expr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird
|
||||
>>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich
|
||||
|
||||
define entities
|
||||
|
||||
>>> premises.append(read_expr(r'elephant(el)'))
|
||||
>>> premises.append(read_expr(r'dove(do)'))
|
||||
>>> premises.append(read_expr(r'ostrich(os)'))
|
||||
|
||||
print the augmented assumptions list
|
||||
|
||||
>>> prover = Prover9Command(None, premises)
|
||||
>>> command = UniqueNamesProver(ClosedWorldProver(prover))
|
||||
>>> for a in command.assumptions(): print(a) # doctest: +SKIP
|
||||
all x.(elephant(x) -> animal(x))
|
||||
all x.(bird(x) -> animal(x))
|
||||
all x.(dove(x) -> bird(x))
|
||||
all x.(ostrich(x) -> bird(x))
|
||||
all x.(flying_ostrich(x) -> ostrich(x))
|
||||
all x.((animal(x) & -Ab1(x)) -> -fly(x))
|
||||
all x.((bird(x) & -Ab2(x)) -> fly(x))
|
||||
all x.((ostrich(x) & -Ab3(x)) -> -fly(x))
|
||||
all x.(bird(x) -> Ab1(x))
|
||||
all x.(ostrich(x) -> Ab2(x))
|
||||
all x.(flying_ostrich(x) -> Ab3(x))
|
||||
elephant(el)
|
||||
dove(do)
|
||||
ostrich(os)
|
||||
all z1.(animal(z1) -> (elephant(z1) | bird(z1)))
|
||||
all z2.(Ab1(z2) -> bird(z2))
|
||||
all z3.(bird(z3) -> (dove(z3) | ostrich(z3)))
|
||||
all z4.(dove(z4) -> (z4 = do))
|
||||
all z5.(Ab2(z5) -> ostrich(z5))
|
||||
all z6.(Ab3(z6) -> flying_ostrich(z6))
|
||||
all z7.(ostrich(z7) -> ((z7 = os) | flying_ostrich(z7)))
|
||||
all z8.-flying_ostrich(z8)
|
||||
all z9.(elephant(z9) -> (z9 = el))
|
||||
-(el = os)
|
||||
-(el = do)
|
||||
-(os = do)
|
||||
|
||||
>>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(el)'), premises))).prove()
|
||||
True
|
||||
>>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('fly(do)'), premises))).prove()
|
||||
True
|
||||
>>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(os)'), premises))).prove()
|
||||
True
|
||||
@@ -0,0 +1,35 @@
|
||||
|
||||
=====================================================
|
||||
PAICE's evaluation statistics for stemming algorithms
|
||||
=====================================================
|
||||
|
||||
Given a list of words with their real lemmas and stems according to the stemming algorithm under evaluation,
|
||||
counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT).
|
||||
|
||||
>>> from nltk.metrics import Paice
|
||||
|
||||
|
||||
-------------------------------------
|
||||
Understemming and Overstemming values
|
||||
-------------------------------------
|
||||
|
||||
>>> lemmas = {'kneel': ['kneel', 'knelt'],
|
||||
... 'range': ['range', 'ranged'],
|
||||
... 'ring': ['ring', 'rang', 'rung']}
|
||||
>>> stems = {'kneel': ['kneel'],
|
||||
... 'knelt': ['knelt'],
|
||||
... 'rang': ['rang', 'range', 'ranged'],
|
||||
... 'ring': ['ring'],
|
||||
... 'rung': ['rung']}
|
||||
>>> p = Paice(lemmas, stems)
|
||||
>>> p.gumt, p.gdmt, p.gwmt, p.gdnt
|
||||
(4.0, 5.0, 2.0, 16.0)
|
||||
|
||||
>>> p.ui, p.oi, p.sw
|
||||
(0.8..., 0.125..., 0.15625...)
|
||||
|
||||
>>> p.errt
|
||||
1.0
|
||||
|
||||
>>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords]
|
||||
[('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')]
|
||||
@@ -0,0 +1,933 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=========
|
||||
Parsing
|
||||
=========
|
||||
|
||||
Unit tests for the Context Free Grammar class
|
||||
---------------------------------------------
|
||||
|
||||
>>> import pickle
|
||||
>>> import subprocess
|
||||
>>> import sys
|
||||
>>> from nltk import Nonterminal, nonterminals, Production, CFG
|
||||
|
||||
>>> nt1 = Nonterminal('NP')
|
||||
>>> nt2 = Nonterminal('VP')
|
||||
|
||||
>>> nt1.symbol()
|
||||
'NP'
|
||||
|
||||
>>> nt1 == Nonterminal('NP')
|
||||
True
|
||||
|
||||
>>> nt1 == nt2
|
||||
False
|
||||
|
||||
>>> S, NP, VP, PP = nonterminals('S, NP, VP, PP')
|
||||
>>> N, V, P, DT = nonterminals('N, V, P, DT')
|
||||
|
||||
>>> prod1 = Production(S, [NP, VP])
|
||||
>>> prod2 = Production(NP, [DT, NP])
|
||||
|
||||
>>> prod1.lhs()
|
||||
S
|
||||
|
||||
>>> prod1.rhs()
|
||||
(NP, VP)
|
||||
|
||||
>>> prod1 == Production(S, [NP, VP])
|
||||
True
|
||||
|
||||
>>> prod1 == prod2
|
||||
False
|
||||
|
||||
>>> grammar = CFG.fromstring("""
|
||||
... S -> NP VP
|
||||
... PP -> P NP
|
||||
... NP -> 'the' N | N PP | 'the' N PP
|
||||
... VP -> V NP | V PP | V NP PP
|
||||
... N -> 'cat'
|
||||
... N -> 'dog'
|
||||
... N -> 'rug'
|
||||
... V -> 'chased'
|
||||
... V -> 'sat'
|
||||
... P -> 'in'
|
||||
... P -> 'on'
|
||||
... """)
|
||||
|
||||
>>> cmd = """import pickle
|
||||
... from nltk import Production
|
||||
... p = Production('S', ['NP', 'VP'])
|
||||
... print(pickle.dumps(p))
|
||||
... """
|
||||
|
||||
>>> # Start a subprocess to simulate pickling in another process
|
||||
>>> proc = subprocess.run([sys.executable, '-c', cmd], stdout=subprocess.PIPE)
|
||||
>>> p1 = pickle.loads(eval(proc.stdout))
|
||||
>>> p2 = Production('S', ['NP', 'VP'])
|
||||
>>> print(hash(p1) == hash(p2))
|
||||
True
|
||||
|
||||
Unit tests for the rd (Recursive Descent Parser) class
|
||||
------------------------------------------------------
|
||||
|
||||
Create and run a recursive descent parser over both a syntactically ambiguous
|
||||
and unambiguous sentence.
|
||||
|
||||
>>> from nltk.parse import RecursiveDescentParser
|
||||
>>> rd = RecursiveDescentParser(grammar)
|
||||
|
||||
>>> sentence1 = 'the cat chased the dog'.split()
|
||||
>>> sentence2 = 'the cat chased the dog on the rug'.split()
|
||||
|
||||
>>> for t in rd.parse(sentence1):
|
||||
... print(t)
|
||||
(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))
|
||||
|
||||
>>> for t in rd.parse(sentence2):
|
||||
... print(t)
|
||||
(S
|
||||
(NP the (N cat))
|
||||
(VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
|
||||
(S
|
||||
(NP the (N cat))
|
||||
(VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))
|
||||
|
||||
|
||||
(dolist (expr doctest-font-lock-keywords)
|
||||
(add-to-list 'font-lock-keywords expr))
|
||||
|
||||
font-lock-keywords
|
||||
(add-to-list 'font-lock-keywords
|
||||
(car doctest-font-lock-keywords))
|
||||
|
||||
|
||||
Unit tests for the sr (Shift Reduce Parser) class
|
||||
-------------------------------------------------
|
||||
|
||||
Create and run a shift reduce parser over both a syntactically ambiguous
|
||||
and unambiguous sentence. Note that unlike the recursive descent parser, at
|
||||
most one parse is ever returned.
|
||||
|
||||
>>> from nltk.parse import ShiftReduceParser
|
||||
>>> sr = ShiftReduceParser(grammar)
|
||||
|
||||
>>> sentence1 = 'the cat chased the dog'.split()
|
||||
>>> sentence2 = 'the cat chased the dog on the rug'.split()
|
||||
|
||||
>>> for t in sr.parse(sentence1):
|
||||
... print(t)
|
||||
(S (NP the (N cat)) (VP (V chased) (NP the (N dog))))
|
||||
|
||||
|
||||
The shift reduce parser uses heuristics to decide what to do when there are
|
||||
multiple possible shift or reduce operations available - for the supplied
|
||||
grammar clearly the wrong operation is selected.
|
||||
|
||||
>>> for t in sr.parse(sentence2):
|
||||
... print(t)
|
||||
|
||||
|
||||
Unit tests for the Chart Parser class
|
||||
-------------------------------------
|
||||
|
||||
We use the demo() function for testing.
|
||||
We must turn off showing of times.
|
||||
|
||||
>>> import nltk
|
||||
|
||||
First we test tracing with a short sentence
|
||||
|
||||
>>> nltk.parse.chart.demo(2, print_times=False, trace=1,
|
||||
... sent='I saw a dog', numparses=1)
|
||||
* Sentence:
|
||||
I saw a dog
|
||||
['I', 'saw', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
* Strategy: Bottom-up
|
||||
<BLANKLINE>
|
||||
|. I . saw . a . dog .|
|
||||
|[---------] . . .| [0:1] 'I'
|
||||
|. [---------] . .| [1:2] 'saw'
|
||||
|. . [---------] .| [2:3] 'a'
|
||||
|. . . [---------]| [3:4] 'dog'
|
||||
|> . . . .| [0:0] NP -> * 'I'
|
||||
|[---------] . . .| [0:1] NP -> 'I' *
|
||||
|> . . . .| [0:0] S -> * NP VP
|
||||
|> . . . .| [0:0] NP -> * NP PP
|
||||
|[---------> . . .| [0:1] S -> NP * VP
|
||||
|[---------> . . .| [0:1] NP -> NP * PP
|
||||
|. > . . .| [1:1] Verb -> * 'saw'
|
||||
|. [---------] . .| [1:2] Verb -> 'saw' *
|
||||
|. > . . .| [1:1] VP -> * Verb NP
|
||||
|. > . . .| [1:1] VP -> * Verb
|
||||
|. [---------> . .| [1:2] VP -> Verb * NP
|
||||
|. [---------] . .| [1:2] VP -> Verb *
|
||||
|. > . . .| [1:1] VP -> * VP PP
|
||||
|[-------------------] . .| [0:2] S -> NP VP *
|
||||
|. [---------> . .| [1:2] VP -> VP * PP
|
||||
|. . > . .| [2:2] Det -> * 'a'
|
||||
|. . [---------] .| [2:3] Det -> 'a' *
|
||||
|. . > . .| [2:2] NP -> * Det Noun
|
||||
|. . [---------> .| [2:3] NP -> Det * Noun
|
||||
|. . . > .| [3:3] Noun -> * 'dog'
|
||||
|. . . [---------]| [3:4] Noun -> 'dog' *
|
||||
|. . [-------------------]| [2:4] NP -> Det Noun *
|
||||
|. . > . .| [2:2] S -> * NP VP
|
||||
|. . > . .| [2:2] NP -> * NP PP
|
||||
|. [-----------------------------]| [1:4] VP -> Verb NP *
|
||||
|. . [------------------->| [2:4] S -> NP * VP
|
||||
|. . [------------------->| [2:4] NP -> NP * PP
|
||||
|[=======================================]| [0:4] S -> NP VP *
|
||||
|. [----------------------------->| [1:4] VP -> VP * PP
|
||||
Nr edges in chart: 33
|
||||
(S (NP I) (VP (Verb saw) (NP (Det a) (Noun dog))))
|
||||
<BLANKLINE>
|
||||
|
||||
Then we test the different parsing Strategies.
|
||||
Note that the number of edges differs between the strategies.
|
||||
|
||||
Top-down
|
||||
|
||||
>>> nltk.parse.chart.demo(1, print_times=False, trace=0,
|
||||
... sent='I saw John with a dog', numparses=2)
|
||||
* Sentence:
|
||||
I saw John with a dog
|
||||
['I', 'saw', 'John', 'with', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
* Strategy: Top-down
|
||||
<BLANKLINE>
|
||||
Nr edges in chart: 48
|
||||
(S
|
||||
(NP I)
|
||||
(VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
|
||||
(S
|
||||
(NP I)
|
||||
(VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
|
||||
<BLANKLINE>
|
||||
|
||||
Bottom-up
|
||||
|
||||
>>> nltk.parse.chart.demo(2, print_times=False, trace=0,
|
||||
... sent='I saw John with a dog', numparses=2)
|
||||
* Sentence:
|
||||
I saw John with a dog
|
||||
['I', 'saw', 'John', 'with', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
* Strategy: Bottom-up
|
||||
<BLANKLINE>
|
||||
Nr edges in chart: 53
|
||||
(S
|
||||
(NP I)
|
||||
(VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
|
||||
(S
|
||||
(NP I)
|
||||
(VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
|
||||
<BLANKLINE>
|
||||
|
||||
Bottom-up Left-Corner
|
||||
|
||||
>>> nltk.parse.chart.demo(3, print_times=False, trace=0,
|
||||
... sent='I saw John with a dog', numparses=2)
|
||||
* Sentence:
|
||||
I saw John with a dog
|
||||
['I', 'saw', 'John', 'with', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
* Strategy: Bottom-up left-corner
|
||||
<BLANKLINE>
|
||||
Nr edges in chart: 36
|
||||
(S
|
||||
(NP I)
|
||||
(VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
|
||||
(S
|
||||
(NP I)
|
||||
(VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
|
||||
<BLANKLINE>
|
||||
|
||||
Left-Corner with Bottom-Up Filter
|
||||
|
||||
>>> nltk.parse.chart.demo(4, print_times=False, trace=0,
|
||||
... sent='I saw John with a dog', numparses=2)
|
||||
* Sentence:
|
||||
I saw John with a dog
|
||||
['I', 'saw', 'John', 'with', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
* Strategy: Filtered left-corner
|
||||
<BLANKLINE>
|
||||
Nr edges in chart: 28
|
||||
(S
|
||||
(NP I)
|
||||
(VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
|
||||
(S
|
||||
(NP I)
|
||||
(VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
|
||||
<BLANKLINE>
|
||||
|
||||
The stepping chart parser
|
||||
|
||||
>>> nltk.parse.chart.demo(5, print_times=False, trace=1,
|
||||
... sent='I saw John with a dog', numparses=2)
|
||||
* Sentence:
|
||||
I saw John with a dog
|
||||
['I', 'saw', 'John', 'with', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
* Strategy: Stepping (top-down vs bottom-up)
|
||||
<BLANKLINE>
|
||||
*** SWITCH TO TOP DOWN
|
||||
|[------] . . . . .| [0:1] 'I'
|
||||
|. [------] . . . .| [1:2] 'saw'
|
||||
|. . [------] . . .| [2:3] 'John'
|
||||
|. . . [------] . .| [3:4] 'with'
|
||||
|. . . . [------] .| [4:5] 'a'
|
||||
|. . . . . [------]| [5:6] 'dog'
|
||||
|> . . . . . .| [0:0] S -> * NP VP
|
||||
|> . . . . . .| [0:0] NP -> * NP PP
|
||||
|> . . . . . .| [0:0] NP -> * Det Noun
|
||||
|> . . . . . .| [0:0] NP -> * 'I'
|
||||
|[------] . . . . .| [0:1] NP -> 'I' *
|
||||
|[------> . . . . .| [0:1] S -> NP * VP
|
||||
|[------> . . . . .| [0:1] NP -> NP * PP
|
||||
|. > . . . . .| [1:1] VP -> * VP PP
|
||||
|. > . . . . .| [1:1] VP -> * Verb NP
|
||||
|. > . . . . .| [1:1] VP -> * Verb
|
||||
|. > . . . . .| [1:1] Verb -> * 'saw'
|
||||
|. [------] . . . .| [1:2] Verb -> 'saw' *
|
||||
|. [------> . . . .| [1:2] VP -> Verb * NP
|
||||
|. [------] . . . .| [1:2] VP -> Verb *
|
||||
|[-------------] . . . .| [0:2] S -> NP VP *
|
||||
|. [------> . . . .| [1:2] VP -> VP * PP
|
||||
*** SWITCH TO BOTTOM UP
|
||||
|. . > . . . .| [2:2] NP -> * 'John'
|
||||
|. . . > . . .| [3:3] PP -> * 'with' NP
|
||||
|. . . > . . .| [3:3] Prep -> * 'with'
|
||||
|. . . . > . .| [4:4] Det -> * 'a'
|
||||
|. . . . . > .| [5:5] Noun -> * 'dog'
|
||||
|. . [------] . . .| [2:3] NP -> 'John' *
|
||||
|. . . [------> . .| [3:4] PP -> 'with' * NP
|
||||
|. . . [------] . .| [3:4] Prep -> 'with' *
|
||||
|. . . . [------] .| [4:5] Det -> 'a' *
|
||||
|. . . . . [------]| [5:6] Noun -> 'dog' *
|
||||
|. [-------------] . . .| [1:3] VP -> Verb NP *
|
||||
|[--------------------] . . .| [0:3] S -> NP VP *
|
||||
|. [-------------> . . .| [1:3] VP -> VP * PP
|
||||
|. . > . . . .| [2:2] S -> * NP VP
|
||||
|. . > . . . .| [2:2] NP -> * NP PP
|
||||
|. . . . > . .| [4:4] NP -> * Det Noun
|
||||
|. . [------> . . .| [2:3] S -> NP * VP
|
||||
|. . [------> . . .| [2:3] NP -> NP * PP
|
||||
|. . . . [------> .| [4:5] NP -> Det * Noun
|
||||
|. . . . [-------------]| [4:6] NP -> Det Noun *
|
||||
|. . . [--------------------]| [3:6] PP -> 'with' NP *
|
||||
|. [----------------------------------]| [1:6] VP -> VP PP *
|
||||
*** SWITCH TO TOP DOWN
|
||||
|. . > . . . .| [2:2] NP -> * Det Noun
|
||||
|. . . . > . .| [4:4] NP -> * NP PP
|
||||
|. . . > . . .| [3:3] VP -> * VP PP
|
||||
|. . . > . . .| [3:3] VP -> * Verb NP
|
||||
|. . . > . . .| [3:3] VP -> * Verb
|
||||
|[=========================================]| [0:6] S -> NP VP *
|
||||
|. [---------------------------------->| [1:6] VP -> VP * PP
|
||||
|. . [---------------------------]| [2:6] NP -> NP PP *
|
||||
|. . . . [------------->| [4:6] NP -> NP * PP
|
||||
|. [----------------------------------]| [1:6] VP -> Verb NP *
|
||||
|. . [--------------------------->| [2:6] S -> NP * VP
|
||||
|. . [--------------------------->| [2:6] NP -> NP * PP
|
||||
|[=========================================]| [0:6] S -> NP VP *
|
||||
|. [---------------------------------->| [1:6] VP -> VP * PP
|
||||
|. . . . . . >| [6:6] VP -> * VP PP
|
||||
|. . . . . . >| [6:6] VP -> * Verb NP
|
||||
|. . . . . . >| [6:6] VP -> * Verb
|
||||
*** SWITCH TO BOTTOM UP
|
||||
|. . . . > . .| [4:4] S -> * NP VP
|
||||
|. . . . [------------->| [4:6] S -> NP * VP
|
||||
*** SWITCH TO TOP DOWN
|
||||
*** SWITCH TO BOTTOM UP
|
||||
*** SWITCH TO TOP DOWN
|
||||
*** SWITCH TO BOTTOM UP
|
||||
*** SWITCH TO TOP DOWN
|
||||
*** SWITCH TO BOTTOM UP
|
||||
Nr edges in chart: 61
|
||||
(S
|
||||
(NP I)
|
||||
(VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
|
||||
(S
|
||||
(NP I)
|
||||
(VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
|
||||
<BLANKLINE>
|
||||
|
||||
|
||||
Unit tests for the Incremental Chart Parser class
|
||||
-------------------------------------------------
|
||||
|
||||
The incremental chart parsers are defined in earleychart.py.
|
||||
We use the demo() function for testing. We must turn off showing of times.
|
||||
|
||||
>>> import nltk
|
||||
|
||||
Earley Chart Parser
|
||||
|
||||
>>> nltk.parse.earleychart.demo(print_times=False, trace=1,
|
||||
... sent='I saw John with a dog', numparses=2)
|
||||
* Sentence:
|
||||
I saw John with a dog
|
||||
['I', 'saw', 'John', 'with', 'a', 'dog']
|
||||
<BLANKLINE>
|
||||
|. I . saw . John . with . a . dog .|
|
||||
|[------] . . . . .| [0:1] 'I'
|
||||
|. [------] . . . .| [1:2] 'saw'
|
||||
|. . [------] . . .| [2:3] 'John'
|
||||
|. . . [------] . .| [3:4] 'with'
|
||||
|. . . . [------] .| [4:5] 'a'
|
||||
|. . . . . [------]| [5:6] 'dog'
|
||||
|> . . . . . .| [0:0] S -> * NP VP
|
||||
|> . . . . . .| [0:0] NP -> * NP PP
|
||||
|> . . . . . .| [0:0] NP -> * Det Noun
|
||||
|> . . . . . .| [0:0] NP -> * 'I'
|
||||
|[------] . . . . .| [0:1] NP -> 'I' *
|
||||
|[------> . . . . .| [0:1] S -> NP * VP
|
||||
|[------> . . . . .| [0:1] NP -> NP * PP
|
||||
|. > . . . . .| [1:1] VP -> * VP PP
|
||||
|. > . . . . .| [1:1] VP -> * Verb NP
|
||||
|. > . . . . .| [1:1] VP -> * Verb
|
||||
|. > . . . . .| [1:1] Verb -> * 'saw'
|
||||
|. [------] . . . .| [1:2] Verb -> 'saw' *
|
||||
|. [------> . . . .| [1:2] VP -> Verb * NP
|
||||
|. [------] . . . .| [1:2] VP -> Verb *
|
||||
|[-------------] . . . .| [0:2] S -> NP VP *
|
||||
|. [------> . . . .| [1:2] VP -> VP * PP
|
||||
|. . > . . . .| [2:2] NP -> * NP PP
|
||||
|. . > . . . .| [2:2] NP -> * Det Noun
|
||||
|. . > . . . .| [2:2] NP -> * 'John'
|
||||
|. . [------] . . .| [2:3] NP -> 'John' *
|
||||
|. [-------------] . . .| [1:3] VP -> Verb NP *
|
||||
|. . [------> . . .| [2:3] NP -> NP * PP
|
||||
|. . . > . . .| [3:3] PP -> * 'with' NP
|
||||
|[--------------------] . . .| [0:3] S -> NP VP *
|
||||
|. [-------------> . . .| [1:3] VP -> VP * PP
|
||||
|. . . [------> . .| [3:4] PP -> 'with' * NP
|
||||
|. . . . > . .| [4:4] NP -> * NP PP
|
||||
|. . . . > . .| [4:4] NP -> * Det Noun
|
||||
|. . . . > . .| [4:4] Det -> * 'a'
|
||||
|. . . . [------] .| [4:5] Det -> 'a' *
|
||||
|. . . . [------> .| [4:5] NP -> Det * Noun
|
||||
|. . . . . > .| [5:5] Noun -> * 'dog'
|
||||
|. . . . . [------]| [5:6] Noun -> 'dog' *
|
||||
|. . . . [-------------]| [4:6] NP -> Det Noun *
|
||||
|. . . [--------------------]| [3:6] PP -> 'with' NP *
|
||||
|. . . . [------------->| [4:6] NP -> NP * PP
|
||||
|. . [---------------------------]| [2:6] NP -> NP PP *
|
||||
|. [----------------------------------]| [1:6] VP -> VP PP *
|
||||
|[=========================================]| [0:6] S -> NP VP *
|
||||
|. [---------------------------------->| [1:6] VP -> VP * PP
|
||||
|. [----------------------------------]| [1:6] VP -> Verb NP *
|
||||
|. . [--------------------------->| [2:6] NP -> NP * PP
|
||||
|[=========================================]| [0:6] S -> NP VP *
|
||||
|. [---------------------------------->| [1:6] VP -> VP * PP
|
||||
(S
|
||||
(NP I)
|
||||
(VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog)))))
|
||||
(S
|
||||
(NP I)
|
||||
(VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog))))))
|
||||
|
||||
|
||||
Unit tests for LARGE context-free grammars
|
||||
------------------------------------------
|
||||
|
||||
Reading the ATIS grammar.
|
||||
|
||||
>>> grammar = nltk.data.load('grammars/large_grammars/atis.cfg')
|
||||
>>> grammar
|
||||
<Grammar with 5517 productions>
|
||||
|
||||
Reading the test sentences.
|
||||
|
||||
>>> sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
|
||||
>>> sentences = nltk.parse.util.extract_test_sentences(sentences)
|
||||
>>> len(sentences)
|
||||
98
|
||||
>>> testsentence = sentences[22]
|
||||
>>> testsentence[0]
|
||||
['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.']
|
||||
>>> testsentence[1]
|
||||
17
|
||||
>>> sentence = testsentence[0]
|
||||
|
||||
Now we test all different parsing strategies.
|
||||
Note that the number of edges differs between the strategies.
|
||||
|
||||
Bottom-up parsing.
|
||||
|
||||
>>> parser = nltk.parse.BottomUpChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
7661
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Bottom-up Left-corner parsing.
|
||||
|
||||
>>> parser = nltk.parse.BottomUpLeftCornerChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
4986
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Left-corner parsing with bottom-up filter.
|
||||
|
||||
>>> parser = nltk.parse.LeftCornerChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
1342
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Top-down parsing.
|
||||
|
||||
>>> parser = nltk.parse.TopDownChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
28352
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Incremental Bottom-up parsing.
|
||||
|
||||
>>> parser = nltk.parse.IncrementalBottomUpChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
7661
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Incremental Bottom-up Left-corner parsing.
|
||||
|
||||
>>> parser = nltk.parse.IncrementalBottomUpLeftCornerChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
4986
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Incremental Left-corner parsing with bottom-up filter.
|
||||
|
||||
>>> parser = nltk.parse.IncrementalLeftCornerChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
1342
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Incremental Top-down parsing.
|
||||
|
||||
>>> parser = nltk.parse.IncrementalTopDownChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
28352
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
Earley parsing. This is similar to the incremental top-down algorithm.
|
||||
|
||||
>>> parser = nltk.parse.EarleyChartParser(grammar)
|
||||
>>> chart = parser.chart_parse(sentence)
|
||||
>>> print((chart.num_edges()))
|
||||
28352
|
||||
>>> print((len(list(chart.parses(grammar.start())))))
|
||||
17
|
||||
|
||||
|
||||
Unit tests for the Probabilistic CFG class
|
||||
------------------------------------------
|
||||
|
||||
>>> from nltk.corpus import treebank
|
||||
>>> from itertools import islice
|
||||
>>> from nltk.grammar import PCFG, induce_pcfg
|
||||
>>> toy_pcfg1 = PCFG.fromstring("""
|
||||
... S -> NP VP [1.0]
|
||||
... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
|
||||
... Det -> 'the' [0.8] | 'my' [0.2]
|
||||
... N -> 'man' [0.5] | 'telescope' [0.5]
|
||||
... VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
|
||||
... V -> 'ate' [0.35] | 'saw' [0.65]
|
||||
... PP -> P NP [1.0]
|
||||
... P -> 'with' [0.61] | 'under' [0.39]
|
||||
... """)
|
||||
|
||||
>>> toy_pcfg2 = PCFG.fromstring("""
|
||||
... S -> NP VP [1.0]
|
||||
... VP -> V NP [.59]
|
||||
... VP -> V [.40]
|
||||
... VP -> VP PP [.01]
|
||||
... NP -> Det N [.41]
|
||||
... NP -> Name [.28]
|
||||
... NP -> NP PP [.31]
|
||||
... PP -> P NP [1.0]
|
||||
... V -> 'saw' [.21]
|
||||
... V -> 'ate' [.51]
|
||||
... V -> 'ran' [.28]
|
||||
... N -> 'boy' [.11]
|
||||
... N -> 'cookie' [.12]
|
||||
... N -> 'table' [.13]
|
||||
... N -> 'telescope' [.14]
|
||||
... N -> 'hill' [.5]
|
||||
... Name -> 'Jack' [.52]
|
||||
... Name -> 'Bob' [.48]
|
||||
... P -> 'with' [.61]
|
||||
... P -> 'under' [.39]
|
||||
... Det -> 'the' [.41]
|
||||
... Det -> 'a' [.31]
|
||||
... Det -> 'my' [.28]
|
||||
... """)
|
||||
|
||||
Create a set of PCFG productions.
|
||||
|
||||
>>> grammar = PCFG.fromstring("""
|
||||
... A -> B B [.3] | C B C [.7]
|
||||
... B -> B D [.5] | C [.5]
|
||||
... C -> 'a' [.1] | 'b' [0.9]
|
||||
... D -> 'b' [1.0]
|
||||
... """)
|
||||
>>> prod = grammar.productions()[0]
|
||||
>>> prod
|
||||
A -> B B [0.3]
|
||||
|
||||
>>> prod.lhs()
|
||||
A
|
||||
|
||||
>>> prod.rhs()
|
||||
(B, B)
|
||||
|
||||
>>> print((prod.prob()))
|
||||
0.3
|
||||
|
||||
>>> grammar.start()
|
||||
A
|
||||
|
||||
>>> grammar.productions()
|
||||
[A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]]
|
||||
|
||||
Induce some productions using parsed Treebank data.
|
||||
|
||||
>>> productions = []
|
||||
>>> for fileid in treebank.fileids()[:2]:
|
||||
... for t in treebank.parsed_sents(fileid):
|
||||
... productions += t.productions()
|
||||
|
||||
>>> grammar = induce_pcfg(S, productions)
|
||||
>>> grammar
|
||||
<Grammar with 71 productions>
|
||||
|
||||
>>> sorted(grammar.productions(lhs=Nonterminal('PP')))[:2]
|
||||
[PP -> IN NP [1.0]]
|
||||
>>> sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2]
|
||||
[NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]]
|
||||
>>> sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2]
|
||||
[JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]]
|
||||
>>> sorted(grammar.productions(lhs=Nonterminal('NP')))[:2]
|
||||
[NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]]
|
||||
|
||||
Unit tests for the Probabilistic Chart Parse classes
|
||||
----------------------------------------------------
|
||||
|
||||
>>> tokens = "Jack saw Bob with my cookie".split()
|
||||
>>> grammar = toy_pcfg2
|
||||
>>> print(grammar)
|
||||
Grammar with 23 productions (start state = S)
|
||||
S -> NP VP [1.0]
|
||||
VP -> V NP [0.59]
|
||||
VP -> V [0.4]
|
||||
VP -> VP PP [0.01]
|
||||
NP -> Det N [0.41]
|
||||
NP -> Name [0.28]
|
||||
NP -> NP PP [0.31]
|
||||
PP -> P NP [1.0]
|
||||
V -> 'saw' [0.21]
|
||||
V -> 'ate' [0.51]
|
||||
V -> 'ran' [0.28]
|
||||
N -> 'boy' [0.11]
|
||||
N -> 'cookie' [0.12]
|
||||
N -> 'table' [0.13]
|
||||
N -> 'telescope' [0.14]
|
||||
N -> 'hill' [0.5]
|
||||
Name -> 'Jack' [0.52]
|
||||
Name -> 'Bob' [0.48]
|
||||
P -> 'with' [0.61]
|
||||
P -> 'under' [0.39]
|
||||
Det -> 'the' [0.41]
|
||||
Det -> 'a' [0.31]
|
||||
Det -> 'my' [0.28]
|
||||
|
||||
Create several parsers using different queuing strategies and show the
|
||||
resulting parses.
|
||||
|
||||
>>> from nltk.parse import pchart
|
||||
|
||||
>>> parser = pchart.InsideChartParser(grammar)
|
||||
>>> for t in parser.parse(tokens):
|
||||
... print(t)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(V saw)
|
||||
(NP
|
||||
(NP (Name Bob))
|
||||
(PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(VP (V saw) (NP (Name Bob)))
|
||||
(PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
|
||||
|
||||
>>> parser = pchart.RandomChartParser(grammar)
|
||||
>>> for t in parser.parse(tokens):
|
||||
... print(t)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(V saw)
|
||||
(NP
|
||||
(NP (Name Bob))
|
||||
(PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(VP (V saw) (NP (Name Bob)))
|
||||
(PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
|
||||
|
||||
>>> parser = pchart.UnsortedChartParser(grammar)
|
||||
>>> for t in parser.parse(tokens):
|
||||
... print(t)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(V saw)
|
||||
(NP
|
||||
(NP (Name Bob))
|
||||
(PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(VP (V saw) (NP (Name Bob)))
|
||||
(PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
|
||||
|
||||
>>> parser = pchart.LongestChartParser(grammar)
|
||||
>>> for t in parser.parse(tokens):
|
||||
... print(t)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(V saw)
|
||||
(NP
|
||||
(NP (Name Bob))
|
||||
(PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(VP (V saw) (NP (Name Bob)))
|
||||
(PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07)
|
||||
|
||||
>>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1)
|
||||
>>> for t in parser.parse(tokens):
|
||||
... print(t)
|
||||
|
||||
|
||||
Unit tests for the Viterbi Parse classes
|
||||
----------------------------------------
|
||||
|
||||
>>> from nltk.parse import ViterbiParser
|
||||
>>> tokens = "Jack saw Bob with my cookie".split()
|
||||
>>> grammar = toy_pcfg2
|
||||
|
||||
Parse the tokenized sentence.
|
||||
|
||||
>>> parser = ViterbiParser(grammar)
|
||||
>>> for t in parser.parse(tokens):
|
||||
... print(t)
|
||||
(S
|
||||
(NP (Name Jack))
|
||||
(VP
|
||||
(V saw)
|
||||
(NP
|
||||
(NP (Name Bob))
|
||||
(PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06)
|
||||
|
||||
|
||||
Unit tests for the FeatStructNonterminal class
|
||||
----------------------------------------------
|
||||
|
||||
>>> from nltk.grammar import FeatStructNonterminal
|
||||
>>> FeatStructNonterminal(
|
||||
... pos='n', agr=FeatStructNonterminal(number='pl', gender='f'))
|
||||
[agr=[gender='f', number='pl'], pos='n']
|
||||
|
||||
>>> FeatStructNonterminal('VP[+fin]/NP[+pl]')
|
||||
VP[+fin]/NP[+pl]
|
||||
|
||||
|
||||
Tracing the Feature Chart Parser
|
||||
--------------------------------
|
||||
|
||||
We use the featurechart.demo() function for tracing the Feature Chart Parser.
|
||||
|
||||
>>> nltk.parse.featurechart.demo(print_times=False,
|
||||
... print_grammar=True,
|
||||
... parser=nltk.parse.featurechart.FeatureChartParser,
|
||||
... sent='I saw John with a dog')
|
||||
<BLANKLINE>
|
||||
Grammar with 18 productions (start state = S[])
|
||||
S[] -> NP[] VP[]
|
||||
PP[] -> Prep[] NP[]
|
||||
NP[] -> NP[] PP[]
|
||||
VP[] -> VP[] PP[]
|
||||
VP[] -> Verb[] NP[]
|
||||
VP[] -> Verb[]
|
||||
NP[] -> Det[pl=?x] Noun[pl=?x]
|
||||
NP[] -> 'John'
|
||||
NP[] -> 'I'
|
||||
Det[] -> 'the'
|
||||
Det[] -> 'my'
|
||||
Det[-pl] -> 'a'
|
||||
Noun[-pl] -> 'dog'
|
||||
Noun[-pl] -> 'cookie'
|
||||
Verb[] -> 'ate'
|
||||
Verb[] -> 'saw'
|
||||
Prep[] -> 'with'
|
||||
Prep[] -> 'under'
|
||||
<BLANKLINE>
|
||||
* FeatureChartParser
|
||||
Sentence: I saw John with a dog
|
||||
|.I.s.J.w.a.d.|
|
||||
|[-] . . . . .| [0:1] 'I'
|
||||
|. [-] . . . .| [1:2] 'saw'
|
||||
|. . [-] . . .| [2:3] 'John'
|
||||
|. . . [-] . .| [3:4] 'with'
|
||||
|. . . . [-] .| [4:5] 'a'
|
||||
|. . . . . [-]| [5:6] 'dog'
|
||||
|[-] . . . . .| [0:1] NP[] -> 'I' *
|
||||
|[-> . . . . .| [0:1] S[] -> NP[] * VP[] {}
|
||||
|[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {}
|
||||
|. [-] . . . .| [1:2] Verb[] -> 'saw' *
|
||||
|. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {}
|
||||
|. [-] . . . .| [1:2] VP[] -> Verb[] *
|
||||
|. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {}
|
||||
|[---] . . . .| [0:2] S[] -> NP[] VP[] *
|
||||
|. . [-] . . .| [2:3] NP[] -> 'John' *
|
||||
|. . [-> . . .| [2:3] S[] -> NP[] * VP[] {}
|
||||
|. . [-> . . .| [2:3] NP[] -> NP[] * PP[] {}
|
||||
|. [---] . . .| [1:3] VP[] -> Verb[] NP[] *
|
||||
|. [---> . . .| [1:3] VP[] -> VP[] * PP[] {}
|
||||
|[-----] . . .| [0:3] S[] -> NP[] VP[] *
|
||||
|. . . [-] . .| [3:4] Prep[] -> 'with' *
|
||||
|. . . [-> . .| [3:4] PP[] -> Prep[] * NP[] {}
|
||||
|. . . . [-] .| [4:5] Det[-pl] -> 'a' *
|
||||
|. . . . [-> .| [4:5] NP[] -> Det[pl=?x] * Noun[pl=?x] {?x: False}
|
||||
|. . . . . [-]| [5:6] Noun[-pl] -> 'dog' *
|
||||
|. . . . [---]| [4:6] NP[] -> Det[-pl] Noun[-pl] *
|
||||
|. . . . [--->| [4:6] S[] -> NP[] * VP[] {}
|
||||
|. . . . [--->| [4:6] NP[] -> NP[] * PP[] {}
|
||||
|. . . [-----]| [3:6] PP[] -> Prep[] NP[] *
|
||||
|. . [-------]| [2:6] NP[] -> NP[] PP[] *
|
||||
|. [---------]| [1:6] VP[] -> VP[] PP[] *
|
||||
|. [--------->| [1:6] VP[] -> VP[] * PP[] {}
|
||||
|[===========]| [0:6] S[] -> NP[] VP[] *
|
||||
|. . [------->| [2:6] S[] -> NP[] * VP[] {}
|
||||
|. . [------->| [2:6] NP[] -> NP[] * PP[] {}
|
||||
|. [---------]| [1:6] VP[] -> Verb[] NP[] *
|
||||
|. [--------->| [1:6] VP[] -> VP[] * PP[] {}
|
||||
|[===========]| [0:6] S[] -> NP[] VP[] *
|
||||
(S[]
|
||||
(NP[] I)
|
||||
(VP[]
|
||||
(VP[] (Verb[] saw) (NP[] John))
|
||||
(PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog)))))
|
||||
(S[]
|
||||
(NP[] I)
|
||||
(VP[]
|
||||
(Verb[] saw)
|
||||
(NP[]
|
||||
(NP[] John)
|
||||
(PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog))))))
|
||||
|
||||
|
||||
Unit tests for the Feature Chart Parser classes
|
||||
-----------------------------------------------
|
||||
|
||||
The list of parsers we want to test.
|
||||
|
||||
>>> parsers = [nltk.parse.featurechart.FeatureChartParser,
|
||||
... nltk.parse.featurechart.FeatureTopDownChartParser,
|
||||
... nltk.parse.featurechart.FeatureBottomUpChartParser,
|
||||
... nltk.parse.featurechart.FeatureBottomUpLeftCornerChartParser,
|
||||
... nltk.parse.earleychart.FeatureIncrementalChartParser,
|
||||
... nltk.parse.earleychart.FeatureEarleyChartParser,
|
||||
... nltk.parse.earleychart.FeatureIncrementalTopDownChartParser,
|
||||
... nltk.parse.earleychart.FeatureIncrementalBottomUpChartParser,
|
||||
... nltk.parse.earleychart.FeatureIncrementalBottomUpLeftCornerChartParser,
|
||||
... ]
|
||||
|
||||
A helper function that tests each parser on the given grammar and sentence.
|
||||
We check that the number of trees is correct, and that all parsers
|
||||
return the same trees. Otherwise an error is printed.
|
||||
|
||||
>>> def unittest(grammar, sentence, nr_trees):
|
||||
... sentence = sentence.split()
|
||||
... trees = None
|
||||
... for P in parsers:
|
||||
... result = P(grammar).parse(sentence)
|
||||
... result = set(tree.freeze() for tree in result)
|
||||
... if len(result) != nr_trees:
|
||||
... print("Wrong nr of trees:", len(result))
|
||||
... elif trees is None:
|
||||
... trees = result
|
||||
... elif result != trees:
|
||||
... print("Trees differ for parser:", P.__name__)
|
||||
|
||||
The demo grammar from before, with an ambiguous sentence.
|
||||
|
||||
>>> isawjohn = nltk.parse.featurechart.demo_grammar()
|
||||
>>> unittest(isawjohn, "I saw John with a dog with my cookie", 5)
|
||||
|
||||
This grammar tests that variables in different grammar rules are renamed
|
||||
before unification. (The problematic variable is in this case ?X).
|
||||
|
||||
>>> whatwasthat = nltk.grammar.FeatureGrammar.fromstring('''
|
||||
... S[] -> NP[num=?N] VP[num=?N, slash=?X]
|
||||
... NP[num=?X] -> "what"
|
||||
... NP[num=?X] -> "that"
|
||||
... VP[num=?P, slash=none] -> V[num=?P] NP[]
|
||||
... V[num=sg] -> "was"
|
||||
... ''')
|
||||
>>> unittest(whatwasthat, "what was that", 1)
|
||||
|
||||
This grammar tests that the same rule can be used in different places
|
||||
in another rule, and that the variables are properly renamed.
|
||||
|
||||
>>> thislovesthat = nltk.grammar.FeatureGrammar.fromstring('''
|
||||
... S[] -> NP[case=nom] V[] NP[case=acc]
|
||||
... NP[case=?X] -> Pron[case=?X]
|
||||
... Pron[] -> "this"
|
||||
... Pron[] -> "that"
|
||||
... V[] -> "loves"
|
||||
... ''')
|
||||
>>> unittest(thislovesthat, "this loves that", 1)
|
||||
|
||||
|
||||
Tests for loading feature grammar files
|
||||
---------------------------------------
|
||||
|
||||
Alternative 1: first load the grammar, then create the parser.
|
||||
|
||||
>>> fcfg = nltk.data.load('grammars/book_grammars/feat0.fcfg')
|
||||
>>> fcp1 = nltk.parse.FeatureChartParser(fcfg)
|
||||
>>> print((type(fcp1)))
|
||||
<class 'nltk.parse.featurechart.FeatureChartParser'>
|
||||
|
||||
Alternative 2: directly load the parser.
|
||||
|
||||
>>> fcp2 = nltk.parse.load_parser('grammars/book_grammars/feat0.fcfg')
|
||||
>>> print((type(fcp2)))
|
||||
<class 'nltk.parse.featurechart.FeatureChartParser'>
|
||||
@@ -0,0 +1,572 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==================================
|
||||
Examples for Portuguese Processing
|
||||
==================================
|
||||
|
||||
This HOWTO contains a variety of examples relating to the Portuguese language.
|
||||
It is intended to be read in conjunction with the NLTK book
|
||||
(``https://www.nltk.org/book/``). For instructions on running the Python
|
||||
interpreter, please see the section *Getting Started with Python*, in Chapter 1.
|
||||
|
||||
--------------------------------------------
|
||||
Python Programming, with Portuguese Examples
|
||||
--------------------------------------------
|
||||
|
||||
Chapter 1 of the NLTK book contains many elementary programming examples, all
|
||||
with English texts. In this section, we'll see some corresponding examples
|
||||
using Portuguese. Please refer to the chapter for full discussion. *Vamos!*
|
||||
|
||||
>>> from nltk.test.portuguese_en_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
>>> from nltk.examples.pt import *
|
||||
*** Introductory Examples for the NLTK Book ***
|
||||
Loading ptext1, ... and psent1, ...
|
||||
Type the name of the text or sentence to view it.
|
||||
Type: 'texts()' or 'sents()' to list the materials.
|
||||
ptext1: Memórias Póstumas de Brás Cubas (1881)
|
||||
ptext2: Dom Casmurro (1899)
|
||||
ptext3: Gênesis
|
||||
ptext4: Folha de Sao Paulo (1994)
|
||||
|
||||
|
||||
Any time we want to find out about these texts, we just have
|
||||
to enter their names at the Python prompt:
|
||||
|
||||
>>> ptext2
|
||||
<Text: Dom Casmurro (1899)>
|
||||
|
||||
Searching Text
|
||||
--------------
|
||||
|
||||
A concordance permits us to see words in context.
|
||||
|
||||
>>> ptext1.concordance('olhos')
|
||||
Building index...
|
||||
Displaying 25 of 138 matches:
|
||||
De pé , à cabeceira da cama , com os olhos estúpidos , a boca entreaberta , a t
|
||||
orelhas . Pela minha parte fechei os olhos e deixei - me ir à ventura . Já agor
|
||||
xões de cérebro enfermo . Como ia de olhos fechados , não via o caminho ; lembr
|
||||
gelos eternos . Com efeito , abri os olhos e vi que o meu animal galopava numa
|
||||
me apareceu então , fitando - me uns olhos rutilantes como o sol . Tudo nessa f
|
||||
mim mesmo . Então , encarei - a com olhos súplices , e pedi mais alguns anos .
|
||||
...
|
||||
|
||||
For a given word, we can find words with a similar text distribution:
|
||||
|
||||
>>> ptext1.similar('chegar')
|
||||
Building word-context index...
|
||||
acabada acudir aludir avistar bramanismo casamento cheguei com contar
|
||||
contrário corpo dali deixei desferirem dizer fazer filhos já leitor lhe
|
||||
>>> ptext3.similar('chegar')
|
||||
Building word-context index...
|
||||
achar alumiar arrombar destruir governar guardar ir lavrar passar que
|
||||
toda tomar ver vir
|
||||
|
||||
We can search for the statistically significant collocations in a text:
|
||||
|
||||
>>> ptext1.collocations()
|
||||
Building collocations list
|
||||
Quincas Borba; Lobo Neves; alguma coisa; Brás Cubas; meu pai; dia
|
||||
seguinte; não sei; Meu pai; alguns instantes; outra vez; outra coisa;
|
||||
por exemplo; mim mesmo; coisa nenhuma; mesma coisa; não era; dias
|
||||
depois; Passeio Público; olhar para; das coisas
|
||||
|
||||
We can search for words in context, with the help of *regular expressions*, e.g.:
|
||||
|
||||
>>> ptext1.findall("<olhos> (<.*>)")
|
||||
estúpidos; e; fechados; rutilantes; súplices; a; do; babavam;
|
||||
na; moles; se; da; umas; espraiavam; chamejantes; espetados;
|
||||
...
|
||||
|
||||
We can automatically generate random text based on a given text, e.g.:
|
||||
|
||||
>>> ptext3.generate() # doctest: +SKIP
|
||||
No princípio , criou Deus os abençoou , dizendo : Onde { estão } e até
|
||||
à ave dos céus , { que } será . Disse mais Abrão : Dá - me a mulher
|
||||
que tomaste ; porque daquele poço Eseque , { tinha .} E disse : Não
|
||||
poderemos descer ; mas , do campo ainda não estava na casa do teu
|
||||
pescoço . E viveu Serugue , depois Simeão e Levi { são } estes ? E o
|
||||
varão , porque habitava na terra de Node , da mão de Esaú : Jeús ,
|
||||
Jalão e Corá
|
||||
|
||||
Texts as List of Words
|
||||
----------------------
|
||||
|
||||
A few sentences have been defined for you.
|
||||
|
||||
>>> psent1
|
||||
['o', 'amor', 'da', 'gl\xf3ria', 'era', 'a', 'coisa', 'mais',
|
||||
'verdadeiramente', 'humana', 'que', 'h\xe1', 'no', 'homem', ',',
|
||||
'e', ',', 'conseq\xfcentemente', ',', 'a', 'sua', 'mais',
|
||||
'genu\xedna', 'fei\xe7\xe3o', '.']
|
||||
>>>
|
||||
|
||||
Notice that the sentence has been *tokenized*. Each token is
|
||||
represented as a string, represented using quotes, e.g. ``'coisa'``.
|
||||
Some strings contain special characters, e.g. ``\xf3``,
|
||||
the internal representation for ó.
|
||||
The tokens are combined in the form of a *list*. How long is this list?
|
||||
|
||||
>>> len(psent1)
|
||||
25
|
||||
>>>
|
||||
|
||||
What is the vocabulary of this sentence?
|
||||
|
||||
>>> sorted(set(psent1))
|
||||
[',', '.', 'a', 'amor', 'coisa', 'conseqüentemente', 'da', 'e', 'era',
|
||||
'feição', 'genuína', 'glória', 'homem', 'humana', 'há', 'mais', 'no',
|
||||
'o', 'que', 'sua', 'verdadeiramente']
|
||||
>>>
|
||||
|
||||
Let's iterate over each item in ``psent2``, and print information for each:
|
||||
|
||||
>>> for w in psent2:
|
||||
... print(w, len(w), w[-1])
|
||||
...
|
||||
Não 3 o
|
||||
consultes 9 s
|
||||
dicionários 11 s
|
||||
. 1 .
|
||||
|
||||
Observe how ``print()`` displays each string in human-readable form.
|
||||
Also notice that we accessed the last character of a string ``w`` using ``w[-1]``.
|
||||
|
||||
We just saw a ``for`` loop above. Another useful control structure is a
|
||||
*list comprehension*.
|
||||
|
||||
>>> [w.upper() for w in psent2]
|
||||
['N\xc3O', 'CONSULTES', 'DICION\xc1RIOS', '.']
|
||||
>>> [w for w in psent1 if w.endswith('a')]
|
||||
['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna']
|
||||
>>> [w for w in ptext4 if len(w) > 15]
|
||||
['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro',
|
||||
'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente']
|
||||
|
||||
We can examine the relative frequency of words in a text, using ``FreqDist``:
|
||||
|
||||
>>> fd1 = FreqDist(ptext1)
|
||||
>>> fd1
|
||||
<FreqDist with 10848 samples and 77098 outcomes>
|
||||
>>> fd1['olhos']
|
||||
137
|
||||
>>> fd1.max()
|
||||
','
|
||||
>>> fd1.samples()[:100]
|
||||
[',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o',
|
||||
'\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu',
|
||||
'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?',
|
||||
'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia',
|
||||
'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito',
|
||||
'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem',
|
||||
'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem',
|
||||
'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia',
|
||||
't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma',
|
||||
'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim',
|
||||
'ent\xe3o', 'dizia', 'aos', 'Borba']
|
||||
|
||||
---------------
|
||||
Reading Corpora
|
||||
---------------
|
||||
|
||||
Accessing the Machado Text Corpus
|
||||
---------------------------------
|
||||
|
||||
NLTK includes the complete works of Machado de Assis.
|
||||
|
||||
>>> from nltk.corpus import machado
|
||||
>>> machado.fileids()
|
||||
['contos/macn001.txt', 'contos/macn002.txt', 'contos/macn003.txt', ...]
|
||||
|
||||
Each file corresponds to one of the works of Machado de Assis. To see a complete
|
||||
list of works, you can look at the corpus README file: ``print(machado.readme())``.
|
||||
Let's access the text of the *Posthumous Memories of Brás Cubas*.
|
||||
|
||||
We can access the text as a list of characters, and access 200 characters starting
|
||||
from position 10,000.
|
||||
|
||||
>>> raw_text = machado.raw('romance/marm05.txt')
|
||||
>>> raw_text[10000:10200]
|
||||
u', primou no\nEstado, e foi um dos amigos particulares do vice-rei Conde
|
||||
da Cunha.\n\nComo este apelido de Cubas lhe\ncheirasse excessivamente a
|
||||
tanoaria, alegava meu pai, bisneto de Dami\xe3o, que o\ndito ape'
|
||||
|
||||
However, this is not a very useful way to work with a text. We generally think
|
||||
of a text as a sequence of words and punctuation, not characters:
|
||||
|
||||
>>> text1 = machado.words('romance/marm05.txt')
|
||||
>>> text1
|
||||
['Romance', ',', 'Mem\xf3rias', 'P\xf3stumas', 'de', ...]
|
||||
>>> len(text1)
|
||||
77098
|
||||
>>> len(set(text1))
|
||||
10848
|
||||
|
||||
Here's a program that finds the most common ngrams that contain a
|
||||
particular target word.
|
||||
|
||||
>>> from nltk import ngrams, FreqDist
|
||||
>>> target_word = 'olhos'
|
||||
>>> fd = FreqDist(ng
|
||||
... for ng in ngrams(text1, 5)
|
||||
... if target_word in ng)
|
||||
>>> for hit in fd.samples():
|
||||
... print(' '.join(hit))
|
||||
...
|
||||
, com os olhos no
|
||||
com os olhos no ar
|
||||
com os olhos no chão
|
||||
e todos com os olhos
|
||||
me estar com os olhos
|
||||
os olhos estúpidos , a
|
||||
os olhos na costura ,
|
||||
os olhos no ar ,
|
||||
, com os olhos espetados
|
||||
, com os olhos estúpidos
|
||||
, com os olhos fitos
|
||||
, com os olhos naquele
|
||||
, com os olhos para
|
||||
|
||||
|
||||
Accessing the MacMorpho Tagged Corpus
|
||||
-------------------------------------
|
||||
|
||||
NLTK includes the MAC-MORPHO Brazilian Portuguese POS-tagged news text,
|
||||
with over a million words of
|
||||
journalistic texts extracted from ten sections of
|
||||
the daily newspaper *Folha de Sao Paulo*, 1994.
|
||||
|
||||
We can access this corpus as a sequence of words or tagged words as follows:
|
||||
|
||||
>>> import nltk.corpus
|
||||
>>> nltk.corpus.mac_morpho.words()
|
||||
['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...]
|
||||
>>> nltk.corpus.mac_morpho.sents()
|
||||
[['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o',
|
||||
'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'],
|
||||
['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional',
|
||||
'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...]
|
||||
>>> nltk.corpus.mac_morpho.tagged_words()
|
||||
[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...]
|
||||
|
||||
We can also access it in sentence chunks.
|
||||
|
||||
>>> nltk.corpus.mac_morpho.tagged_sents()
|
||||
[[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'),
|
||||
('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'),
|
||||
('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'),
|
||||
('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'),
|
||||
('Paulo', 'NPROP')],
|
||||
[('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'),
|
||||
('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'),
|
||||
('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'),
|
||||
('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...]
|
||||
|
||||
This data can be used to train taggers (examples below for the Floresta treebank).
|
||||
|
||||
Accessing the Floresta Portuguese Treebank
|
||||
------------------------------------------
|
||||
|
||||
The NLTK data distribution includes the
|
||||
"Floresta Sinta(c)tica Corpus" version 7.4, available from
|
||||
``https://www.linguateca.pt/Floresta/``.
|
||||
|
||||
We can access this corpus as a sequence of words or tagged words as follows:
|
||||
|
||||
>>> from nltk.corpus import floresta
|
||||
>>> floresta.words()
|
||||
['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...]
|
||||
>>> floresta.tagged_words()
|
||||
[('Um', '>N+art'), ('revivalismo', 'H+n'), ...]
|
||||
|
||||
The tags consist of some syntactic information, followed by a plus sign,
|
||||
followed by a conventional part-of-speech tag. Let's strip off the material before
|
||||
the plus sign:
|
||||
|
||||
>>> def simplify_tag(t):
|
||||
... if "+" in t:
|
||||
... return t[t.index("+")+1:]
|
||||
... else:
|
||||
... return t
|
||||
>>> twords = floresta.tagged_words()
|
||||
>>> twords = [(w.lower(), simplify_tag(t)) for (w,t) in twords]
|
||||
>>> twords[:10]
|
||||
[('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'),
|
||||
('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')]
|
||||
|
||||
Pretty printing the tagged words:
|
||||
|
||||
>>> print(' '.join(word + '/' + tag for (word, tag) in twords[:10]))
|
||||
um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop é/v-fin um/art ex-libris/n de/prp a/art
|
||||
|
||||
Count the word tokens and types, and determine the most common word:
|
||||
|
||||
>>> words = floresta.words()
|
||||
>>> len(words)
|
||||
211852
|
||||
>>> fd = nltk.FreqDist(words)
|
||||
>>> len(fd)
|
||||
29421
|
||||
>>> fd.max()
|
||||
'de'
|
||||
|
||||
List the 20 most frequent tags, in order of decreasing frequency:
|
||||
|
||||
>>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()]
|
||||
>>> fd = nltk.FreqDist(tags)
|
||||
>>> fd.keys()[:20]
|
||||
['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.',
|
||||
'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp',
|
||||
'pron-pers', '\xab', '\xbb', 'conj-s', '}']
|
||||
|
||||
We can also access the corpus grouped by sentence:
|
||||
|
||||
>>> floresta.sents()
|
||||
[['Um', 'revivalismo', 'refrescante'],
|
||||
['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite',
|
||||
'algarvia', '.'], ...]
|
||||
>>> floresta.tagged_sents()
|
||||
[[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')],
|
||||
[('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'),
|
||||
('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'),
|
||||
('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')],
|
||||
...]
|
||||
>>> floresta.parsed_sents()
|
||||
[Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']),
|
||||
Tree('N<+adj', ['refrescante'])]),
|
||||
Tree('STA+fcl',
|
||||
[Tree('SUBJ+np', [Tree('>N+art', ['O']),
|
||||
Tree('H+prop', ['7_e_Meio'])]),
|
||||
Tree('P+v-fin', ['\xe9']),
|
||||
Tree('SC+np',
|
||||
[Tree('>N+art', ['um']),
|
||||
Tree('H+n', ['ex-libris']),
|
||||
Tree('N<+pp', [Tree('H+prp', ['de']),
|
||||
Tree('P<+np', [Tree('>N+art', ['a']),
|
||||
Tree('H+n', ['noite']),
|
||||
Tree('N<+adj', ['algarvia'])])])]),
|
||||
Tree('.', ['.'])]), ...]
|
||||
|
||||
To view a parse tree, use the ``draw()`` method, e.g.:
|
||||
|
||||
>>> psents = floresta.parsed_sents()
|
||||
>>> psents[5].draw() # doctest: +SKIP
|
||||
|
||||
Character Encodings
|
||||
-------------------
|
||||
|
||||
Python understands the common character encoding used for Portuguese, ISO 8859-1 (ISO Latin 1).
|
||||
|
||||
>>> import os, nltk.test
|
||||
>>> testdir = os.path.split(nltk.test.__file__)[0]
|
||||
>>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO 8859-1')
|
||||
>>> text[:60]
|
||||
'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais '
|
||||
>>> print(text[:60])
|
||||
O 7 e Meio é um ex-libris da noite algarvia.
|
||||
É uma das mais
|
||||
|
||||
For more information about character encodings and Python, please see section 3.3 of the book.
|
||||
|
||||
----------------
|
||||
Processing Tasks
|
||||
----------------
|
||||
|
||||
|
||||
Simple Concordancing
|
||||
--------------------
|
||||
|
||||
Here's a function that takes a word and a specified amount of context (measured
|
||||
in characters), and generates a concordance for that word.
|
||||
|
||||
>>> def concordance(word, context=30):
|
||||
... for sent in floresta.sents():
|
||||
... if word in sent:
|
||||
... pos = sent.index(word)
|
||||
... left = ' '.join(sent[:pos])
|
||||
... right = ' '.join(sent[pos+1:])
|
||||
... print('%*s %s %-*s' %
|
||||
... (context, left[-context:], word, context, right[:context]))
|
||||
|
||||
>>> concordance("dar") # doctest: +SKIP
|
||||
anduru , foi o suficiente para dar a volta a o resultado .
|
||||
1. O PÚBLICO veio dar a a imprensa diária portuguesa
|
||||
A fartura de pensamento pode dar maus resultados e nós não quer
|
||||
Começa a dar resultados a política de a Uni
|
||||
ial começar a incorporar- lo e dar forma a um ' site ' que tem se
|
||||
r com Constantino para ele lhe dar também os papéis assinados .
|
||||
va a brincar , pois não lhe ia dar procuração nenhuma enquanto nã
|
||||
?rica como o ant?doto capaz de dar sentido a o seu enorme poder .
|
||||
. . .
|
||||
>>> concordance("vender") # doctest: +SKIP
|
||||
er recebido uma encomenda para vender 4000 blindados a o Iraque .
|
||||
mérico_Amorim caso conseguisse vender o lote de acções de o empresár
|
||||
mpre ter jovens simp?ticos a ? vender ? chega ! }
|
||||
Disse que o governo vai vender ? desde autom?vel at? particip
|
||||
ndiciou ontem duas pessoas por vender carro com ágio .
|
||||
A intenção de Fleury é vender as ações para equilibrar as fi
|
||||
|
||||
Part-of-Speech Tagging
|
||||
----------------------
|
||||
|
||||
Let's begin by getting the tagged sentence data, and simplifying the tags
|
||||
as described earlier.
|
||||
|
||||
>>> from nltk.corpus import floresta
|
||||
>>> tsents = floresta.tagged_sents()
|
||||
>>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent]
|
||||
>>> train = tsents[100:]
|
||||
>>> test = tsents[:100]
|
||||
|
||||
We already know that ``n`` is the most common tag, so we can set up a
|
||||
default tagger that tags every word as a noun, and see how well it does:
|
||||
|
||||
>>> tagger0 = nltk.DefaultTagger('n')
|
||||
>>> nltk.tag.accuracy(tagger0, test)
|
||||
0.17697228144989338
|
||||
|
||||
Evidently, about one in every six words is a noun. Let's improve on this by
|
||||
training a unigram tagger:
|
||||
|
||||
>>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
|
||||
>>> nltk.tag.accuracy(tagger1, test)
|
||||
0.87029140014214645
|
||||
|
||||
Next a bigram tagger:
|
||||
|
||||
>>> tagger2 = nltk.BigramTagger(train, backoff=tagger1)
|
||||
>>> nltk.tag.accuracy(tagger2, test)
|
||||
0.89019189765458417
|
||||
|
||||
|
||||
Sentence Segmentation
|
||||
---------------------
|
||||
|
||||
Punkt is a language-neutral sentence segmentation tool. We load the pre-trained Portuguese model and use it to split a text into sentences:
|
||||
|
||||
>>> from nltk.tokenize import PunktTokenizer
|
||||
>>> sent_tokenizer = PunktTokenizer("portuguese")
|
||||
|
||||
>>> raw_text = machado.raw('romance/marm05.txt')
|
||||
>>> sentences = sent_tokenizer.tokenize(raw_text)
|
||||
>>> for sent in sentences[1000:1005]:
|
||||
... print("<<", sent, ">>")
|
||||
...
|
||||
<< Em verdade, parecia ainda mais mulher do que era;
|
||||
seria criança nos seus folgares de moça; mas assim quieta, impassível, tinha a
|
||||
compostura da mulher casada. >>
|
||||
<< Talvez essa circunstância lhe diminuía um pouco da
|
||||
graça virginal. >>
|
||||
<< Depressa nos familiarizamos; a mãe fazia-lhe grandes elogios, eu
|
||||
escutava-os de boa sombra, e ela sorria com os olhos fúlgidos, como se lá dentro
|
||||
do cérebro lhe estivesse a voar uma borboletinha de asas de ouro e olhos de
|
||||
diamante... >>
|
||||
<< Digo lá dentro, porque cá fora o
|
||||
que esvoaçou foi uma borboleta preta, que subitamente penetrou na varanda, e
|
||||
começou a bater as asas em derredor de D. Eusébia. >>
|
||||
<< D. Eusébia deu um grito,
|
||||
levantou-se, praguejou umas palavras soltas: - T'esconjuro!... >>
|
||||
|
||||
The sentence tokenizer can be trained and evaluated on other text.
|
||||
The source text (from the Floresta Portuguese Treebank) contains one sentence per line.
|
||||
We read the text, split it into its lines, and then join these lines together using
|
||||
spaces. Now the information about sentence breaks has been discarded. We split this
|
||||
material into training and testing data:
|
||||
|
||||
>>> import os, nltk.test
|
||||
>>> testdir = os.path.split(nltk.test.__file__)[0]
|
||||
>>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO-8859-1')
|
||||
>>> lines = text.split('\n')
|
||||
>>> train = ' '.join(lines[10:])
|
||||
>>> test = ' '.join(lines[:10])
|
||||
|
||||
Now we train the sentence segmenter (or sentence tokenizer) and use it on our test sentences:
|
||||
|
||||
>>> from nltk.tokenize import PunktSentenceTokenizer
|
||||
>>> stok = nltk.PunktSentenceTokenizer(train)
|
||||
>>> print(stok.tokenize(test))
|
||||
['O 7 e Meio \xe9 um ex-libris da noite algarvia.',
|
||||
'\xc9 uma das mais antigas discotecas do Algarve, situada em Albufeira,
|
||||
que continua a manter os tra\xe7os decorativos e as clientelas de sempre.',
|
||||
'\xc9 um pouco a vers\xe3o de uma esp\xe9cie de \xaboutro lado\xbb da noite,
|
||||
a meio caminho entre os devaneios de uma fauna perif\xe9rica, seja de Lisboa,
|
||||
Londres, Dublin ou Faro e Portim\xe3o, e a postura circunspecta dos fi\xe9is da casa,
|
||||
que dela esperam a m\xfasica \xabgeracionista\xbb dos 60 ou dos 70.',
|
||||
'N\xe3o deixa de ser, nos tempos que correm, um certo \xabvery typical\xbb algarvio,
|
||||
cabe\xe7a de cartaz para os que querem fugir a algumas movimenta\xe7\xf5es nocturnas
|
||||
j\xe1 a caminho da ritualiza\xe7\xe3o de massas, do g\xe9nero \xabvamos todos ao
|
||||
Calypso e encontramo-nos na Locomia\xbb.',
|
||||
'E assim, aos 2,5 milh\xf5es que o Minist\xe9rio do Planeamento e Administra\xe7\xe3o
|
||||
do Territ\xf3rio j\xe1 gasta no pagamento do pessoal afecto a estes organismos,
|
||||
v\xeam juntar-se os montantes das obras propriamente ditas, que os munic\xedpios,
|
||||
j\xe1 com projectos na m\xe3o, v\xeam reivindicar junto do Executivo, como salienta
|
||||
aquele membro do Governo.',
|
||||
'E o dinheiro \xabn\xe3o falta s\xf3 \xe0s c\xe2maras\xbb, lembra o secret\xe1rio de Estado,
|
||||
que considera que a solu\xe7\xe3o para as autarquias \xe9 \xabespecializarem-se em
|
||||
fundos comunit\xe1rios\xbb.',
|
||||
'Mas como, se muitas n\xe3o disp\xf5em, nos seus quadros, dos t\xe9cnicos necess\xe1rios?',
|
||||
'\xabEncomendem-nos a projectistas de fora\xbb porque, se as obras vierem a ser financiadas,
|
||||
eles at\xe9 saem de gra\xe7a, j\xe1 que, nesse caso, \xabos fundos comunit\xe1rios pagam
|
||||
os projectos, o mesmo n\xe3o acontecendo quando eles s\xe3o feitos pelos GAT\xbb,
|
||||
dado serem organismos do Estado.',
|
||||
'Essa poder\xe1 vir a ser uma hip\xf3tese, at\xe9 porque, no terreno, a capacidade dos GAT
|
||||
est\xe1 cada vez mais enfraquecida.',
|
||||
'Alguns at\xe9 j\xe1 desapareceram, como o de Castro Verde, e outros t\xeam vindo a perder quadros.']
|
||||
|
||||
NLTK's data collection includes a trained model for Portuguese sentence
|
||||
segmentation, which can be loaded as follows. It is faster to load a trained model than
|
||||
to retrain it.
|
||||
|
||||
>>> from nltk.tokenize import PunktTokenizer
|
||||
>>> stok = PunktTokenizer("portuguese")
|
||||
|
||||
Stemming
|
||||
--------
|
||||
|
||||
NLTK includes the RSLP Portuguese stemmer. Here we use it to stem some Portuguese text:
|
||||
|
||||
>>> stemmer = nltk.stem.RSLPStemmer()
|
||||
>>> stemmer.stem("copiar")
|
||||
'copi'
|
||||
>>> stemmer.stem("paisagem")
|
||||
'pais'
|
||||
|
||||
|
||||
Stopwords
|
||||
---------
|
||||
|
||||
NLTK includes Portuguese stopwords:
|
||||
|
||||
>>> stopwords = nltk.corpus.stopwords.words('portuguese')
|
||||
>>> stopwords[:10]
|
||||
['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9']
|
||||
|
||||
Now we can use these to filter text. Let's find the most frequent words (other than stopwords)
|
||||
and print them in descending order of frequency:
|
||||
|
||||
>>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords)
|
||||
>>> for word in list(fd.keys())[:20]:
|
||||
... print(word, fd[word])
|
||||
, 13444
|
||||
. 7725
|
||||
« 2369
|
||||
» 2310
|
||||
é 1305
|
||||
o 1086
|
||||
} 1047
|
||||
{ 1044
|
||||
a 897
|
||||
; 633
|
||||
em 516
|
||||
ser 466
|
||||
sobre 349
|
||||
os 313
|
||||
anos 301
|
||||
ontem 292
|
||||
ainda 279
|
||||
segundo 256
|
||||
ter 249
|
||||
dois 231
|
||||
@@ -0,0 +1,4 @@
|
||||
def setup_module():
|
||||
import pytest
|
||||
|
||||
pytest.skip("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!")
|
||||
@@ -0,0 +1,306 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===========
|
||||
Probability
|
||||
===========
|
||||
|
||||
>>> from nltk.test.probability_fixt import setup_module
|
||||
>>> setup_module()
|
||||
|
||||
>>> import nltk
|
||||
>>> from nltk.probability import *
|
||||
|
||||
FreqDist
|
||||
--------
|
||||
|
||||
>>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!']
|
||||
>>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.']
|
||||
|
||||
>>> fd1 = nltk.FreqDist(text1)
|
||||
>>> fd1 == nltk.FreqDist(text1)
|
||||
True
|
||||
|
||||
Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order.
|
||||
|
||||
>>> import itertools
|
||||
>>> both = nltk.FreqDist(text1 + text2)
|
||||
>>> both_most_common = both.most_common()
|
||||
>>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1]))))
|
||||
[('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)]
|
||||
|
||||
>>> both == fd1 + nltk.FreqDist(text2)
|
||||
True
|
||||
>>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged
|
||||
True
|
||||
|
||||
>>> fd2 = nltk.FreqDist(text2)
|
||||
>>> fd1.update(fd2)
|
||||
>>> fd1 == both
|
||||
True
|
||||
|
||||
>>> fd1 = nltk.FreqDist(text1)
|
||||
>>> fd1.update(text2)
|
||||
>>> fd1 == both
|
||||
True
|
||||
|
||||
>>> fd1 = nltk.FreqDist(text1)
|
||||
>>> fd2 = nltk.FreqDist(fd1)
|
||||
>>> fd2 == fd1
|
||||
True
|
||||
|
||||
``nltk.FreqDist`` can be pickled:
|
||||
|
||||
>>> import pickle
|
||||
>>> fd1 = nltk.FreqDist(text1)
|
||||
>>> pickled = pickle.dumps(fd1)
|
||||
>>> fd1 == pickle.loads(pickled)
|
||||
True
|
||||
|
||||
Mathematical operations:
|
||||
|
||||
>>> FreqDist('abbb') + FreqDist('bcc')
|
||||
FreqDist({'b': 4, 'c': 2, 'a': 1})
|
||||
>>> FreqDist('abbbc') - FreqDist('bccd')
|
||||
FreqDist({'b': 2, 'a': 1})
|
||||
>>> FreqDist('abbb') | FreqDist('bcc')
|
||||
FreqDist({'b': 3, 'c': 2, 'a': 1})
|
||||
>>> FreqDist('abbb') & FreqDist('bcc')
|
||||
FreqDist({'b': 1})
|
||||
|
||||
ConditionalFreqDist
|
||||
-------------------
|
||||
|
||||
>>> cfd1 = ConditionalFreqDist()
|
||||
>>> cfd1[1] = FreqDist('abbbb')
|
||||
>>> cfd1[2] = FreqDist('xxxxyy')
|
||||
>>> cfd1
|
||||
<ConditionalFreqDist with 2 conditions>
|
||||
|
||||
>>> cfd2 = ConditionalFreqDist()
|
||||
>>> cfd2[1] = FreqDist('bbccc')
|
||||
>>> cfd2[2] = FreqDist('xxxyyyzz')
|
||||
>>> cfd2[3] = FreqDist('m')
|
||||
>>> cfd2
|
||||
<ConditionalFreqDist with 3 conditions>
|
||||
|
||||
>>> r = cfd1 + cfd2
|
||||
>>> [(i,r[i]) for i in r.conditions()]
|
||||
[(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))]
|
||||
|
||||
>>> r = cfd1 - cfd2
|
||||
>>> [(i,r[i]) for i in r.conditions()]
|
||||
[(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))]
|
||||
|
||||
>>> r = cfd1 | cfd2
|
||||
>>> [(i,r[i]) for i in r.conditions()]
|
||||
[(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))]
|
||||
|
||||
>>> r = cfd1 & cfd2
|
||||
>>> [(i,r[i]) for i in r.conditions()]
|
||||
[(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))]
|
||||
|
||||
Testing some HMM estimators
|
||||
---------------------------
|
||||
|
||||
We extract a small part (500 sentences) of the Brown corpus
|
||||
|
||||
>>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
|
||||
>>> print(len(corpus))
|
||||
500
|
||||
|
||||
We create a HMM trainer - note that we need the tags and symbols
|
||||
from the whole corpus, not just the training corpus
|
||||
|
||||
>>> from nltk.util import unique_list
|
||||
>>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
|
||||
>>> print(len(tag_set))
|
||||
92
|
||||
>>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
|
||||
>>> print(len(symbols))
|
||||
1464
|
||||
>>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
|
||||
|
||||
We divide the corpus into 90% training and 10% testing
|
||||
|
||||
>>> train_corpus = []
|
||||
>>> test_corpus = []
|
||||
>>> for i in range(len(corpus)):
|
||||
... if i % 10:
|
||||
... train_corpus += [corpus[i]]
|
||||
... else:
|
||||
... test_corpus += [corpus[i]]
|
||||
>>> print(len(train_corpus))
|
||||
450
|
||||
>>> print(len(test_corpus))
|
||||
50
|
||||
|
||||
And now we can test the estimators
|
||||
|
||||
>>> def train_and_test(est):
|
||||
... hmm = trainer.train_supervised(train_corpus, estimator=est)
|
||||
... print('%.2f%%' % (100 * hmm.accuracy(test_corpus)))
|
||||
|
||||
Maximum Likelihood Estimation
|
||||
-----------------------------
|
||||
- this resulted in an initialization error before r7209
|
||||
|
||||
>>> mle = lambda fd, bins: MLEProbDist(fd)
|
||||
>>> train_and_test(mle)
|
||||
22.75%
|
||||
|
||||
Laplace (= Lidstone with gamma==1)
|
||||
|
||||
>>> train_and_test(LaplaceProbDist)
|
||||
66.04%
|
||||
|
||||
Expected Likelihood Estimation (= Lidstone with gamma==0.5)
|
||||
|
||||
>>> train_and_test(ELEProbDist)
|
||||
73.01%
|
||||
|
||||
Lidstone Estimation, for gamma==0.1, 0.5 and 1
|
||||
(the latter two should be exactly equal to ELE and Laplace above)
|
||||
|
||||
>>> def lidstone(gamma):
|
||||
... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)
|
||||
>>> train_and_test(lidstone(0.1))
|
||||
82.51%
|
||||
>>> train_and_test(lidstone(0.5))
|
||||
73.01%
|
||||
>>> train_and_test(lidstone(1.0))
|
||||
66.04%
|
||||
|
||||
Witten Bell Estimation
|
||||
----------------------
|
||||
- This resulted in ZeroDivisionError before r7209
|
||||
|
||||
>>> train_and_test(WittenBellProbDist)
|
||||
88.12%
|
||||
|
||||
Good Turing Estimation
|
||||
|
||||
>>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5)
|
||||
>>> train_and_test(gt)
|
||||
86.93%
|
||||
|
||||
Kneser Ney Estimation
|
||||
---------------------
|
||||
Since the Kneser-Ney distribution is best suited for trigrams, we must adjust
|
||||
our testing accordingly.
|
||||
|
||||
>>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1]))
|
||||
... for x, y, z in nltk.trigrams(sent)]
|
||||
... for sent in corpus[:100]]
|
||||
|
||||
We will then need to redefine the rest of the training/testing variables
|
||||
|
||||
>>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent)
|
||||
>>> len(tag_set)
|
||||
906
|
||||
|
||||
>>> symbols = unique_list(word for sent in corpus for (word,tag) in sent)
|
||||
>>> len(symbols)
|
||||
1341
|
||||
|
||||
>>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
|
||||
>>> train_corpus = []
|
||||
>>> test_corpus = []
|
||||
|
||||
>>> for i in range(len(corpus)):
|
||||
... if i % 10:
|
||||
... train_corpus += [corpus[i]]
|
||||
... else:
|
||||
... test_corpus += [corpus[i]]
|
||||
|
||||
>>> len(train_corpus)
|
||||
90
|
||||
>>> len(test_corpus)
|
||||
10
|
||||
|
||||
>>> kn = lambda fd, bins: KneserNeyProbDist(fd)
|
||||
>>> train_and_test(kn)
|
||||
0.86%
|
||||
|
||||
Remains to be added:
|
||||
- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist
|
||||
|
||||
Squashed bugs
|
||||
-------------
|
||||
|
||||
Issue 511: override pop and popitem to invalidate the cache
|
||||
|
||||
>>> fd = nltk.FreqDist('a')
|
||||
>>> list(fd.keys())
|
||||
['a']
|
||||
>>> fd.pop('a')
|
||||
1
|
||||
>>> list(fd.keys())
|
||||
[]
|
||||
|
||||
Issue 533: access cumulative frequencies with no arguments
|
||||
|
||||
>>> fd = nltk.FreqDist('aab')
|
||||
>>> list(fd._cumulative_frequencies(['a']))
|
||||
[2.0]
|
||||
>>> list(fd._cumulative_frequencies(['a', 'b']))
|
||||
[2.0, 3.0]
|
||||
|
||||
Issue 579: override clear to reset some variables
|
||||
|
||||
>>> fd = FreqDist('aab')
|
||||
>>> fd.clear()
|
||||
>>> fd.N()
|
||||
0
|
||||
|
||||
Issue 351: fix fileids method of CategorizedCorpusReader to not inadvertently
|
||||
add errant categories
|
||||
|
||||
>>> from nltk.corpus import brown
|
||||
>>> brown.fileids('blah')
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Category blah not found
|
||||
>>> brown.categories()
|
||||
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
|
||||
|
||||
Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default
|
||||
otherwise any unseen events get a probability of zero, i.e.,
|
||||
they don't get smoothed
|
||||
|
||||
>>> from nltk import SimpleGoodTuringProbDist, FreqDist
|
||||
>>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})
|
||||
>>> p = SimpleGoodTuringProbDist(fd)
|
||||
>>> p.prob('a')
|
||||
0.017649766667026317...
|
||||
>>> p.prob('o')
|
||||
0.0843305021534041...
|
||||
>>> p.prob('z')
|
||||
0.022727272727272728...
|
||||
>>> p.prob('foobar')
|
||||
0.022727272727272728...
|
||||
|
||||
``MLEProbDist``, ``ConditionalProbDist``, ``DictionaryConditionalProbDist`` and
|
||||
``ConditionalFreqDist`` can be pickled:
|
||||
|
||||
>>> import pickle
|
||||
>>> pd = MLEProbDist(fd)
|
||||
>>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples())
|
||||
True
|
||||
>>> dpd = DictionaryConditionalProbDist({'x': pd})
|
||||
>>> unpickled = pickle.loads(pickle.dumps(dpd))
|
||||
>>> dpd['x'].prob('a')
|
||||
0.011363636...
|
||||
>>> dpd['x'].prob('a') == unpickled['x'].prob('a')
|
||||
True
|
||||
>>> cfd = nltk.probability.ConditionalFreqDist()
|
||||
>>> cfd['foo']['hello'] += 1
|
||||
>>> cfd['foo']['hello'] += 1
|
||||
>>> cfd['bar']['hello'] += 1
|
||||
>>> cfd2 = pickle.loads(pickle.dumps(cfd))
|
||||
>>> cfd2 == cfd
|
||||
True
|
||||
>>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist)
|
||||
>>> cpd2 = pickle.loads(pickle.dumps(cpd))
|
||||
>>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello')
|
||||
True
|
||||
@@ -0,0 +1,8 @@
|
||||
# probability.doctest uses HMM which requires numpy;
|
||||
# skip probability.doctest if numpy is not available
|
||||
|
||||
|
||||
def setup_module():
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("numpy")
|
||||
@@ -0,0 +1,176 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
========
|
||||
PropBank
|
||||
========
|
||||
|
||||
The PropBank Corpus provides predicate-argument annotation for the
|
||||
entire Penn Treebank. Each verb in the treebank is annotated by a single
|
||||
instance in PropBank, containing information about the location of
|
||||
the verb, and the location and identity of its arguments:
|
||||
|
||||
>>> from nltk.corpus import propbank
|
||||
>>> pb_instances = propbank.instances()
|
||||
>>> print(pb_instances)
|
||||
[<PropbankInstance: wsj_0001.mrg, sent 0, word 8>,
|
||||
<PropbankInstance: wsj_0001.mrg, sent 1, word 10>, ...]
|
||||
|
||||
Each propbank instance defines the following member variables:
|
||||
|
||||
- Location information: `fileid`, `sentnum`, `wordnum`
|
||||
- Annotator information: `tagger`
|
||||
- Inflection information: `inflection`
|
||||
- Roleset identifier: `roleset`
|
||||
- Verb (aka predicate) location: `predicate`
|
||||
- Argument locations and types: `arguments`
|
||||
|
||||
The following examples show the types of these arguments:
|
||||
|
||||
>>> inst = pb_instances[103]
|
||||
>>> (inst.fileid, inst.sentnum, inst.wordnum)
|
||||
('wsj_0004.mrg', 8, 16)
|
||||
>>> inst.tagger
|
||||
'gold'
|
||||
>>> inst.inflection
|
||||
<PropbankInflection: vp--a>
|
||||
>>> infl = inst.inflection
|
||||
>>> infl.form, infl.tense, infl.aspect, infl.person, infl.voice
|
||||
('v', 'p', '-', '-', 'a')
|
||||
>>> inst.roleset
|
||||
'rise.01'
|
||||
>>> inst.predicate
|
||||
PropbankTreePointer(16, 0)
|
||||
>>> inst.arguments
|
||||
((PropbankTreePointer(0, 2), 'ARG1'),
|
||||
(PropbankTreePointer(13, 1), 'ARGM-DIS'),
|
||||
(PropbankTreePointer(17, 1), 'ARG4-to'),
|
||||
(PropbankTreePointer(20, 1), 'ARG3-from'))
|
||||
|
||||
The location of the predicate and of the arguments are encoded using
|
||||
`PropbankTreePointer` objects, as well as `PropbankChainTreePointer`
|
||||
objects and `PropbankSplitTreePointer` objects. A
|
||||
`PropbankTreePointer` consists of a `wordnum` and a `height`:
|
||||
|
||||
>>> print(inst.predicate.wordnum, inst.predicate.height)
|
||||
16 0
|
||||
|
||||
This identifies the tree constituent that is headed by the word that
|
||||
is the `wordnum`\ 'th token in the sentence, and whose span is found
|
||||
by going `height` nodes up in the tree. This type of pointer is only
|
||||
useful if we also have the corresponding tree structure, since it
|
||||
includes empty elements such as traces in the word number count. The
|
||||
trees for 10% of the standard PropBank Corpus are contained in the
|
||||
`treebank` corpus:
|
||||
|
||||
>>> tree = inst.tree
|
||||
|
||||
>>> from nltk.corpus import treebank
|
||||
>>> assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum]
|
||||
|
||||
>>> inst.predicate.select(tree)
|
||||
Tree('VBD', ['rose'])
|
||||
>>> for (argloc, argid) in inst.arguments:
|
||||
... print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50]))
|
||||
ARG1 (NP-SBJ (NP (DT The) (NN yield)) (PP (IN on) (NP (
|
||||
ARGM-DIS (PP (IN for) (NP (NN example)))
|
||||
ARG4-to (PP-DIR (TO to) (NP (CD 8.04) (NN %)))
|
||||
ARG3-from (PP-DIR (IN from) (NP (CD 7.90) (NN %)))
|
||||
|
||||
Propbank tree pointers can be converted to standard tree locations,
|
||||
which are usually easier to work with, using the `treepos()` method:
|
||||
|
||||
>>> treepos = inst.predicate.treepos(tree)
|
||||
>>> print (treepos, tree[treepos])
|
||||
(4, 0) (VBD rose)
|
||||
|
||||
In some cases, argument locations will be encoded using
|
||||
`PropbankChainTreePointer`\ s (for trace chains) or
|
||||
`PropbankSplitTreePointer`\ s (for discontinuous constituents). Both
|
||||
of these objects contain a single member variable, `pieces`,
|
||||
containing a list of the constituent pieces. They also define the
|
||||
method `select()`, which will return a tree containing all the
|
||||
elements of the argument. (A new head node is created, labeled
|
||||
"*CHAIN*" or "*SPLIT*", since the argument is not a single constituent
|
||||
in the original tree). Instance #6 contains an example of an argument
|
||||
that is both discontinuous and contains a chain:
|
||||
|
||||
>>> inst = pb_instances[6]
|
||||
>>> inst.roleset
|
||||
'expose.01'
|
||||
>>> argloc, argid = inst.arguments[2]
|
||||
>>> argloc
|
||||
<PropbankChainTreePointer: 22:1,24:0,25:1*27:0>
|
||||
>>> argloc.pieces
|
||||
[<PropbankSplitTreePointer: 22:1,24:0,25:1>, PropbankTreePointer(27, 0)]
|
||||
>>> argloc.pieces[0].pieces
|
||||
...
|
||||
[PropbankTreePointer(22, 1), PropbankTreePointer(24, 0),
|
||||
PropbankTreePointer(25, 1)]
|
||||
>>> print(argloc.select(inst.tree))
|
||||
(*CHAIN*
|
||||
(*SPLIT* (NP (DT a) (NN group)) (IN of) (NP (NNS workers)))
|
||||
(-NONE- *))
|
||||
|
||||
The PropBank Corpus also provides access to the frameset files, which
|
||||
define the argument labels used by the annotations, on a per-verb
|
||||
basis. Each frameset file contains one or more predicates, such as
|
||||
'turn' or 'turn_on', each of which is divided into coarse-grained word
|
||||
senses called rolesets. For each roleset, the frameset file provides
|
||||
descriptions of the argument roles, along with examples.
|
||||
|
||||
>>> expose_01 = propbank.roleset('expose.01')
|
||||
>>> turn_01 = propbank.roleset('turn.01')
|
||||
>>> print(turn_01)
|
||||
<Element 'roleset' at ...>
|
||||
>>> for role in turn_01.findall("roles/role"):
|
||||
... print(role.attrib['n'], role.attrib['descr'])
|
||||
0 turner
|
||||
1 thing turning
|
||||
m direction, location
|
||||
|
||||
>>> from xml.etree import ElementTree
|
||||
>>> print(ElementTree.tostring(turn_01.find('example')).decode('utf8').strip())
|
||||
<example name="transitive agentive">
|
||||
<text>
|
||||
John turned the key in the lock.
|
||||
</text>
|
||||
<arg n="0">John</arg>
|
||||
<rel>turned</rel>
|
||||
<arg n="1">the key</arg>
|
||||
<arg f="LOC" n="m">in the lock</arg>
|
||||
</example>
|
||||
|
||||
Note that the standard corpus distribution only contains 10% of the
|
||||
treebank, so the parse trees are not available for instances starting
|
||||
at 9353:
|
||||
|
||||
>>> inst = pb_instances[9352]
|
||||
>>> inst.fileid
|
||||
'wsj_0199.mrg'
|
||||
>>> print(inst.tree)
|
||||
(S (NP-SBJ (NNP Trinity)) (VP (VBD said) (SBAR (-NONE- 0) ...))
|
||||
>>> print(inst.predicate.select(inst.tree))
|
||||
(VB begin)
|
||||
|
||||
>>> inst = pb_instances[9353]
|
||||
>>> inst.fileid
|
||||
'wsj_0200.mrg'
|
||||
>>> print(inst.tree)
|
||||
None
|
||||
>>> print(inst.predicate.select(inst.tree))
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Parse tree not available
|
||||
|
||||
However, if you supply your own version of the treebank corpus (by
|
||||
putting it before the nltk-provided version on `nltk.data.path`, or
|
||||
by creating a `ptb` directory as described above and using the
|
||||
`propbank_ptb` module), then you can access the trees for all
|
||||
instances.
|
||||
|
||||
A list of the verb lemmas contained in PropBank is returned by the
|
||||
`propbank.verbs()` method:
|
||||
|
||||
>>> propbank.verbs()
|
||||
['abandon', 'abate', 'abdicate', 'abet', 'abide', ...]
|
||||
@@ -0,0 +1,263 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
======================
|
||||
Information Extraction
|
||||
======================
|
||||
|
||||
Information Extraction standardly consists of three subtasks:
|
||||
|
||||
#. Named Entity Recognition
|
||||
|
||||
#. Relation Extraction
|
||||
|
||||
#. Template Filling
|
||||
|
||||
Named Entities
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
The IEER corpus is marked up for a variety of Named Entities. A Named
|
||||
Entity (more strictly, a Named Entity mention) is a name of an
|
||||
entity belonging to a specified class. For example, the Named Entity
|
||||
classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so
|
||||
on. Within NLTK, Named Entities are represented as subtrees within a
|
||||
chunk structure: the class name is treated as node label, while the
|
||||
entity mention itself appears as the leaves of the subtree. This is
|
||||
illustrated below, where we show an extract of the chunk
|
||||
representation of document NYT_19980315.064:
|
||||
|
||||
>>> from nltk.corpus import ieer
|
||||
>>> docs = ieer.parsed_docs('NYT_19980315')
|
||||
>>> tree = docs[1].text
|
||||
>>> print(tree)
|
||||
(DOCUMENT
|
||||
...
|
||||
``It's
|
||||
a
|
||||
chance
|
||||
to
|
||||
think
|
||||
about
|
||||
first-level
|
||||
questions,''
|
||||
said
|
||||
Ms.
|
||||
(PERSON Cohn)
|
||||
,
|
||||
a
|
||||
partner
|
||||
in
|
||||
the
|
||||
(ORGANIZATION McGlashan & Sarrail)
|
||||
firm
|
||||
in
|
||||
(LOCATION San Mateo)
|
||||
,
|
||||
(LOCATION Calif.)
|
||||
...)
|
||||
|
||||
Thus, the Named Entity mentions in this example are *Cohn*, *McGlashan &
|
||||
Sarrail*, *San Mateo* and *Calif.*.
|
||||
|
||||
The CoNLL2002 Dutch and Spanish data is treated similarly, although in
|
||||
this case, the strings are also POS tagged.
|
||||
|
||||
>>> from nltk.corpus import conll2002
|
||||
>>> for doc in conll2002.chunked_sents('ned.train')[27]:
|
||||
... print(doc)
|
||||
('Het', 'Art')
|
||||
(ORG Hof/N van/Prep Cassatie/N)
|
||||
('verbrak', 'V')
|
||||
('het', 'Art')
|
||||
('arrest', 'N')
|
||||
('zodat', 'Conj')
|
||||
('het', 'Pron')
|
||||
('moest', 'V')
|
||||
('worden', 'V')
|
||||
('overgedaan', 'V')
|
||||
('door', 'Prep')
|
||||
('het', 'Art')
|
||||
('hof', 'N')
|
||||
('van', 'Prep')
|
||||
('beroep', 'N')
|
||||
('van', 'Prep')
|
||||
(LOC Antwerpen/N)
|
||||
('.', 'Punc')
|
||||
|
||||
Relation Extraction
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Relation Extraction standardly consists of identifying specified
|
||||
relations between Named Entities. For example, assuming that we can
|
||||
recognize ORGANIZATIONs and LOCATIONs in text, we might want to also
|
||||
recognize pairs *(o, l)* of these kinds of entities such that *o* is
|
||||
located in *l*.
|
||||
|
||||
The `sem.relextract` module provides some tools to help carry out a
|
||||
simple version of this task. The `tree2semi_rel()` function splits a chunk
|
||||
document into a list of two-member lists, each of which consists of a
|
||||
(possibly empty) string followed by a `Tree` (i.e., a Named Entity):
|
||||
|
||||
>>> from nltk.sem import relextract
|
||||
>>> pairs = relextract.tree2semi_rel(tree)
|
||||
>>> for s, tree in pairs[18:22]:
|
||||
... print('("...%s", %s)' % (" ".join(s[-5:]),tree))
|
||||
("...about first-level questions,'' said Ms.", (PERSON Cohn))
|
||||
("..., a partner in the", (ORGANIZATION McGlashan & Sarrail))
|
||||
("...firm in", (LOCATION San Mateo))
|
||||
("...,", (LOCATION Calif.))
|
||||
|
||||
The function `semi_rel2reldict()` processes triples of these pairs, i.e.,
|
||||
triples of the form ``((string1, Tree1), (string2, Tree2), (string3,
|
||||
Tree3))`` and outputs a dictionary (a `reldict`) in which ``Tree1`` is
|
||||
the subject of the relation, ``string2`` is the filler
|
||||
and ``Tree3`` is the object of the relation. ``string1`` and ``string3`` are
|
||||
stored as left and right context respectively.
|
||||
|
||||
>>> reldicts = relextract.semi_rel2reldict(pairs)
|
||||
>>> for k, v in sorted(reldicts[0].items()):
|
||||
... print(k, '=>', v)
|
||||
filler => of messages to their own ``Cyberia'' ...
|
||||
lcon => transactions.'' Each week, they post
|
||||
objclass => ORGANIZATION
|
||||
objsym => white_house
|
||||
objtext => White House
|
||||
rcon => for access to its planned
|
||||
subjclass => CARDINAL
|
||||
subjsym => hundreds
|
||||
subjtext => hundreds
|
||||
untagged_filler => of messages to their own ``Cyberia'' ...
|
||||
|
||||
The next example shows some of the values for two `reldict`\ s
|
||||
corresponding to the ``'NYT_19980315'`` text extract shown earlier.
|
||||
|
||||
>>> for r in reldicts[18:20]:
|
||||
... print('=' * 20)
|
||||
... print(r['subjtext'])
|
||||
... print(r['filler'])
|
||||
... print(r['objtext'])
|
||||
====================
|
||||
Cohn
|
||||
, a partner in the
|
||||
McGlashan & Sarrail
|
||||
====================
|
||||
McGlashan & Sarrail
|
||||
firm in
|
||||
San Mateo
|
||||
|
||||
The function `extract_rels()` allows us to filter the `reldict`\ s
|
||||
according to the classes of the subject and object named entities. In
|
||||
addition, we can specify that the filler text has to match a given
|
||||
regular expression, as illustrated in the next example. Here, we are
|
||||
looking for pairs of entities in the IN relation, where IN has
|
||||
signature <ORG, LOC>.
|
||||
|
||||
>>> import re
|
||||
>>> IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
|
||||
>>> for fileid in ieer.fileids():
|
||||
... for doc in ieer.parsed_docs(fileid):
|
||||
... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
|
||||
... print(relextract.rtuple(rel))
|
||||
[ORG: 'Christian Democrats'] ', the leading political forces in' [LOC: 'Italy']
|
||||
[ORG: 'AP'] ') _ Lebanese guerrillas attacked Israeli forces in southern' [LOC: 'Lebanon']
|
||||
[ORG: 'Security Council'] 'adopted Resolution 425. Huge yellow banners hung across intersections in' [LOC: 'Beirut']
|
||||
[ORG: 'U.N.'] 'failures in' [LOC: 'Africa']
|
||||
[ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia']
|
||||
[ORG: 'U.N.'] 'partners on a more effective role in' [LOC: 'Africa']
|
||||
[ORG: 'AP'] ') _ A bomb exploded in a mosque in central' [LOC: 'San`a']
|
||||
[ORG: 'Krasnoye Sormovo'] 'shipyard in the Soviet city of' [LOC: 'Gorky']
|
||||
[ORG: 'Kelab Golf Darul Ridzuan'] 'in' [LOC: 'Perak']
|
||||
[ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia']
|
||||
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
|
||||
[ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo']
|
||||
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
|
||||
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
|
||||
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
|
||||
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
|
||||
...
|
||||
|
||||
The next example illustrates a case where the pattern is a disjunction
|
||||
of roles that a PERSON can occupy in an ORGANIZATION.
|
||||
|
||||
>>> roles = r"""
|
||||
... (.*(
|
||||
... analyst|
|
||||
... chair(wo)?man|
|
||||
... commissioner|
|
||||
... counsel|
|
||||
... director|
|
||||
... economist|
|
||||
... editor|
|
||||
... executive|
|
||||
... foreman|
|
||||
... governor|
|
||||
... head|
|
||||
... lawyer|
|
||||
... leader|
|
||||
... librarian).*)|
|
||||
... manager|
|
||||
... partner|
|
||||
... president|
|
||||
... producer|
|
||||
... professor|
|
||||
... researcher|
|
||||
... spokes(wo)?man|
|
||||
... writer|
|
||||
... ,\sof\sthe?\s* # "X, of (the) Y"
|
||||
... """
|
||||
>>> ROLES = re.compile(roles, re.VERBOSE)
|
||||
>>> for fileid in ieer.fileids():
|
||||
... for doc in ieer.parsed_docs(fileid):
|
||||
... for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
|
||||
... print(relextract.rtuple(rel))
|
||||
[PER: 'Kivutha Kibwana'] ', of the' [ORG: 'National Convention Assembly']
|
||||
[PER: 'Boban Boskovic'] ', chief executive of the' [ORG: 'Plastika']
|
||||
[PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations']
|
||||
[PER: 'Kiriyenko'] 'became a foreman at the' [ORG: 'Krasnoye Sormovo']
|
||||
[PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations']
|
||||
[PER: 'Mike Godwin'] ', chief counsel for the' [ORG: 'Electronic Frontier Foundation']
|
||||
...
|
||||
|
||||
In the case of the CoNLL2002 data, we can include POS tags in the
|
||||
query pattern. This example also illustrates how the output can be
|
||||
presented as something that looks more like a clause in a logical language.
|
||||
|
||||
>>> de = """
|
||||
... .*
|
||||
... (
|
||||
... de/SP|
|
||||
... del/SP
|
||||
... )
|
||||
... """
|
||||
>>> DE = re.compile(de, re.VERBOSE)
|
||||
>>> rels = [rel for doc in conll2002.chunked_sents('esp.train')
|
||||
... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
|
||||
>>> for r in rels[:10]:
|
||||
... print(relextract.clause(r, relsym='DE'))
|
||||
DE('tribunal_supremo', 'victoria')
|
||||
DE('museo_de_arte', 'alcorc\xf3n')
|
||||
DE('museo_de_bellas_artes', 'a_coru\xf1a')
|
||||
DE('siria', 'l\xedbano')
|
||||
DE('uni\xf3n_europea', 'pek\xedn')
|
||||
DE('ej\xe9rcito', 'rogberi')
|
||||
DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n')
|
||||
DE('psoe', 'villanueva_de_la_serena')
|
||||
DE('ej\xe9rcito', 'l\xedbano')
|
||||
DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta')
|
||||
>>> vnv = """
|
||||
... (
|
||||
... is/V|
|
||||
... was/V|
|
||||
... werd/V|
|
||||
... wordt/V
|
||||
... )
|
||||
... .*
|
||||
... van/Prep
|
||||
... """
|
||||
>>> VAN = re.compile(vnv, re.VERBOSE)
|
||||
>>> for doc in conll2002.chunked_sents('ned.train'):
|
||||
... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
|
||||
... print(relextract.clause(r, relsym="VAN"))
|
||||
VAN("cornet_d'elzius", 'buitenlandse_handel')
|
||||
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
|
||||
VAN('annie_lennox', 'eurythmics')
|
||||
@@ -0,0 +1,222 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=========================
|
||||
Resolution Theorem Prover
|
||||
=========================
|
||||
|
||||
>>> from nltk.inference.resolution import *
|
||||
>>> from nltk.sem import logic
|
||||
>>> from nltk.sem.logic import *
|
||||
>>> logic._counter._value = 0
|
||||
>>> read_expr = logic.Expression.fromstring
|
||||
|
||||
>>> P = read_expr('P')
|
||||
>>> Q = read_expr('Q')
|
||||
>>> R = read_expr('R')
|
||||
>>> A = read_expr('A')
|
||||
>>> B = read_expr('B')
|
||||
>>> x = read_expr('x')
|
||||
>>> y = read_expr('y')
|
||||
>>> z = read_expr('z')
|
||||
|
||||
-------------------------------
|
||||
Test most_general_unification()
|
||||
-------------------------------
|
||||
>>> print(most_general_unification(x, x))
|
||||
{}
|
||||
>>> print(most_general_unification(A, A))
|
||||
{}
|
||||
>>> print(most_general_unification(A, x))
|
||||
{x: A}
|
||||
>>> print(most_general_unification(x, A))
|
||||
{x: A}
|
||||
>>> print(most_general_unification(x, y))
|
||||
{x: y}
|
||||
>>> print(most_general_unification(P(x), P(A)))
|
||||
{x: A}
|
||||
>>> print(most_general_unification(P(x,B), P(A,y)))
|
||||
{x: A, y: B}
|
||||
>>> print(most_general_unification(P(x,B), P(B,x)))
|
||||
{x: B}
|
||||
>>> print(most_general_unification(P(x,y), P(A,x)))
|
||||
{x: A, y: x}
|
||||
>>> print(most_general_unification(P(Q(x)), P(y)))
|
||||
{y: Q(x)}
|
||||
|
||||
------------
|
||||
Test unify()
|
||||
------------
|
||||
>>> print(Clause([]).unify(Clause([])))
|
||||
[]
|
||||
>>> print(Clause([P(x)]).unify(Clause([-P(A)])))
|
||||
[{}]
|
||||
>>> print(Clause([P(A), Q(x)]).unify(Clause([-P(x), R(x)])))
|
||||
[{R(A), Q(A)}]
|
||||
>>> print(Clause([P(A), Q(x), R(x,y)]).unify(Clause([-P(x), Q(y)])))
|
||||
[{Q(y), Q(A), R(A,y)}]
|
||||
>>> print(Clause([P(A), -Q(y)]).unify(Clause([-P(x), Q(B)])))
|
||||
[{}]
|
||||
>>> print(Clause([P(x), Q(x)]).unify(Clause([-P(A), -Q(B)])))
|
||||
[{-Q(B), Q(A)}, {-P(A), P(B)}]
|
||||
>>> print(Clause([P(x,x), Q(x), R(x)]).unify(Clause([-P(A,z), -Q(B)])))
|
||||
[{-Q(B), Q(A), R(A)}, {-P(A,z), R(B), P(B,B)}]
|
||||
|
||||
>>> a = clausify(read_expr('P(A)'))
|
||||
>>> b = clausify(read_expr('A=B'))
|
||||
>>> print(a[0].unify(b[0]))
|
||||
[{P(B)}]
|
||||
|
||||
-------------------------
|
||||
Test is_tautology()
|
||||
-------------------------
|
||||
>>> print(Clause([P(A), -P(A)]).is_tautology())
|
||||
True
|
||||
>>> print(Clause([-P(A), P(A)]).is_tautology())
|
||||
True
|
||||
>>> print(Clause([P(x), -P(A)]).is_tautology())
|
||||
False
|
||||
>>> print(Clause([Q(B), -P(A), P(A)]).is_tautology())
|
||||
True
|
||||
>>> print(Clause([-Q(A), P(R(A)), -P(R(A)), Q(x), -R(y)]).is_tautology())
|
||||
True
|
||||
>>> print(Clause([P(x), -Q(A)]).is_tautology())
|
||||
False
|
||||
|
||||
-------------------------
|
||||
Test subsumes()
|
||||
-------------------------
|
||||
>>> print(Clause([P(A), Q(B)]).subsumes(Clause([P(A), Q(B)])))
|
||||
True
|
||||
>>> print(Clause([-P(A)]).subsumes(Clause([P(A)])))
|
||||
False
|
||||
>>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), P(A)])))
|
||||
True
|
||||
>>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), R(A), P(A)])))
|
||||
True
|
||||
>>> print(Clause([P(A), R(A), Q(B)]).subsumes(Clause([Q(B), P(A)])))
|
||||
False
|
||||
>>> print(Clause([P(x)]).subsumes(Clause([P(A)])))
|
||||
True
|
||||
>>> print(Clause([P(A)]).subsumes(Clause([P(x)])))
|
||||
True
|
||||
|
||||
------------
|
||||
Test prove()
|
||||
------------
|
||||
>>> print(ResolutionProverCommand(read_expr('man(x)')).prove())
|
||||
False
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) -> --man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('(man(x) <-> man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('-(man(x) <-> -man(x))')).prove())
|
||||
True
|
||||
>>> print(ResolutionProverCommand(read_expr('all x.man(x)')).prove())
|
||||
False
|
||||
>>> print(ResolutionProverCommand(read_expr('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')).prove())
|
||||
False
|
||||
>>> print(ResolutionProverCommand(read_expr('some x.all y.sees(x,y)')).prove())
|
||||
False
|
||||
|
||||
>>> p1 = read_expr('all x.(man(x) -> mortal(x))')
|
||||
>>> p2 = read_expr('man(Socrates)')
|
||||
>>> c = read_expr('mortal(Socrates)')
|
||||
>>> ResolutionProverCommand(c, [p1,p2]).prove()
|
||||
True
|
||||
|
||||
>>> p1 = read_expr('all x.(man(x) -> walks(x))')
|
||||
>>> p2 = read_expr('man(John)')
|
||||
>>> c = read_expr('some y.walks(y)')
|
||||
>>> ResolutionProverCommand(c, [p1,p2]).prove()
|
||||
True
|
||||
|
||||
>>> p = read_expr('some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))')
|
||||
>>> c = read_expr('some e0.walk(e0,mary)')
|
||||
>>> ResolutionProverCommand(c, [p]).prove()
|
||||
True
|
||||
|
||||
------------
|
||||
Test proof()
|
||||
------------
|
||||
>>> p1 = read_expr('all x.(man(x) -> mortal(x))')
|
||||
>>> p2 = read_expr('man(Socrates)')
|
||||
>>> c = read_expr('mortal(Socrates)')
|
||||
>>> logic._counter._value = 0
|
||||
>>> tp = ResolutionProverCommand(c, [p1,p2])
|
||||
>>> tp.prove()
|
||||
True
|
||||
>>> print(tp.proof())
|
||||
[1] {-mortal(Socrates)} A
|
||||
[2] {-man(z2), mortal(z2)} A
|
||||
[3] {man(Socrates)} A
|
||||
[4] {-man(Socrates)} (1, 2)
|
||||
[5] {mortal(Socrates)} (2, 3)
|
||||
[6] {} (1, 5)
|
||||
<BLANKLINE>
|
||||
|
||||
------------------
|
||||
Question Answering
|
||||
------------------
|
||||
One answer
|
||||
|
||||
>>> p1 = read_expr('father_of(art,john)')
|
||||
>>> p2 = read_expr('father_of(bob,kim)')
|
||||
>>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))')
|
||||
>>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))')
|
||||
>>> logic._counter._value = 0
|
||||
>>> tp = ResolutionProverCommand(None, [p1,p2,p3,c])
|
||||
>>> sorted(tp.find_answers())
|
||||
[<ConstantExpression art>]
|
||||
>>> print(tp.proof()) # doctest: +SKIP
|
||||
[1] {father_of(art,john)} A
|
||||
[2] {father_of(bob,kim)} A
|
||||
[3] {-father_of(z3,z4), parent_of(z3,z4)} A
|
||||
[4] {-parent_of(z6,john), ANSWER(z6)} A
|
||||
[5] {parent_of(art,john)} (1, 3)
|
||||
[6] {parent_of(bob,kim)} (2, 3)
|
||||
[7] {ANSWER(z6), -father_of(z6,john)} (3, 4)
|
||||
[8] {ANSWER(art)} (1, 7)
|
||||
[9] {ANSWER(art)} (4, 5)
|
||||
<BLANKLINE>
|
||||
|
||||
Multiple answers
|
||||
|
||||
>>> p1 = read_expr('father_of(art,john)')
|
||||
>>> p2 = read_expr('mother_of(ann,john)')
|
||||
>>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))')
|
||||
>>> p4 = read_expr('all x.all y.(mother_of(x,y) -> parent_of(x,y))')
|
||||
>>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))')
|
||||
>>> logic._counter._value = 0
|
||||
>>> tp = ResolutionProverCommand(None, [p1,p2,p3,p4,c])
|
||||
>>> sorted(tp.find_answers())
|
||||
[<ConstantExpression ann>, <ConstantExpression art>]
|
||||
>>> print(tp.proof()) # doctest: +SKIP
|
||||
[ 1] {father_of(art,john)} A
|
||||
[ 2] {mother_of(ann,john)} A
|
||||
[ 3] {-father_of(z3,z4), parent_of(z3,z4)} A
|
||||
[ 4] {-mother_of(z7,z8), parent_of(z7,z8)} A
|
||||
[ 5] {-parent_of(z10,john), ANSWER(z10)} A
|
||||
[ 6] {parent_of(art,john)} (1, 3)
|
||||
[ 7] {parent_of(ann,john)} (2, 4)
|
||||
[ 8] {ANSWER(z10), -father_of(z10,john)} (3, 5)
|
||||
[ 9] {ANSWER(art)} (1, 8)
|
||||
[10] {ANSWER(z10), -mother_of(z10,john)} (4, 5)
|
||||
[11] {ANSWER(ann)} (2, 10)
|
||||
[12] {ANSWER(art)} (5, 6)
|
||||
[13] {ANSWER(ann)} (5, 7)
|
||||
<BLANKLINE>
|
||||
@@ -0,0 +1,667 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=========
|
||||
Semantics
|
||||
=========
|
||||
|
||||
>>> # Setup tests by setting the counter to 0
|
||||
>>> from nltk.sem import logic
|
||||
>>> logic._counter._value = 0
|
||||
|
||||
>>> import nltk
|
||||
>>> from nltk.sem import Valuation, Model
|
||||
>>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),
|
||||
... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])),
|
||||
... ('dog', set(['d1'])),
|
||||
... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
|
||||
>>> val = Valuation(v)
|
||||
>>> dom = val.domain
|
||||
>>> m = Model(dom, val)
|
||||
|
||||
Evaluation
|
||||
----------
|
||||
|
||||
The top-level method of a ``Model`` instance is ``evaluate()``, which
|
||||
assigns a semantic value to expressions of the ``logic`` module, under
|
||||
an assignment ``g``:
|
||||
|
||||
>>> dom = val.domain
|
||||
>>> g = nltk.sem.Assignment(dom)
|
||||
>>> m.evaluate('all x.(boy(x) -> - girl(x))', g)
|
||||
True
|
||||
|
||||
|
||||
``evaluate()`` calls a recursive function ``satisfy()``, which in turn
|
||||
calls a function ``i()`` to interpret non-logical constants and
|
||||
individual variables. ``i()`` delegates the interpretation of these to
|
||||
the model's ``Valuation`` and the variable assignment ``g``
|
||||
respectively. Any atomic expression which cannot be assigned a value
|
||||
by ``i`` raises an ``Undefined`` exception; this is caught by
|
||||
``evaluate``, which returns the string ``'Undefined'``.
|
||||
|
||||
>>> m.evaluate('walk(adam)', g, trace=2)
|
||||
<BLANKLINE>
|
||||
'walk(adam)' is undefined under M, g
|
||||
'Undefined'
|
||||
|
||||
Batch Processing
|
||||
----------------
|
||||
|
||||
The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to
|
||||
help with processing multiple sentences. Here's an example of the first of these:
|
||||
|
||||
>>> sents = ['Mary walks']
|
||||
>>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg')
|
||||
>>> for result in results:
|
||||
... for (synrep, semrep) in result:
|
||||
... print(synrep)
|
||||
(S[SEM=<walk(mary)>]
|
||||
(NP[-LOC, NUM='sg', SEM=<\P.P(mary)>]
|
||||
(PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary))
|
||||
(VP[NUM='sg', SEM=<\x.walk(x)>]
|
||||
(IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks)))
|
||||
|
||||
In order to provide backwards compatibility with 'legacy' grammars where the semantic value
|
||||
is specified with a lowercase
|
||||
``sem`` feature, the relevant feature name can be passed to the function using the
|
||||
``semkey`` parameter, as shown here:
|
||||
|
||||
>>> sents = ['raining']
|
||||
>>> g = nltk.grammar.FeatureGrammar.fromstring("""
|
||||
... % start S
|
||||
... S[sem=<raining>] -> 'raining'
|
||||
... """)
|
||||
>>> results = nltk.sem.util.interpret_sents(sents, g, semkey='sem')
|
||||
>>> for result in results:
|
||||
... for (synrep, semrep) in result:
|
||||
... print(semrep)
|
||||
raining
|
||||
|
||||
The function ``evaluate_sents()`` works in a similar manner, but also needs to be
|
||||
passed a ``Model`` against which the semantic representations are evaluated.
|
||||
|
||||
Unit Tests
|
||||
==========
|
||||
|
||||
|
||||
Unit tests for relations and valuations
|
||||
---------------------------------------
|
||||
|
||||
>>> from nltk.sem import *
|
||||
|
||||
Relations are sets of tuples, all of the same length.
|
||||
|
||||
>>> s1 = set([('d1', 'd2'), ('d1', 'd1'), ('d2', 'd1')])
|
||||
>>> is_rel(s1)
|
||||
True
|
||||
>>> s2 = set([('d1', 'd2'), ('d1', 'd2'), ('d1',)])
|
||||
>>> is_rel(s2)
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Set set([('d1', 'd2'), ('d1',)]) contains sequences of different lengths
|
||||
>>> s3 = set(['d1', 'd2'])
|
||||
>>> is_rel(s3)
|
||||
Traceback (most recent call last):
|
||||
. . .
|
||||
ValueError: Set set(['d2', 'd1']) contains sequences of different lengths
|
||||
>>> s4 = set2rel(s3)
|
||||
>>> is_rel(s4)
|
||||
True
|
||||
>>> is_rel(set())
|
||||
True
|
||||
>>> null_binary_rel = set([(None, None)])
|
||||
>>> is_rel(null_binary_rel)
|
||||
True
|
||||
|
||||
Sets of entities are converted into sets of singleton tuples
|
||||
(containing strings).
|
||||
|
||||
>>> sorted(set2rel(s3))
|
||||
[('d1',), ('d2',)]
|
||||
>>> sorted(set2rel(set([1,3,5,])))
|
||||
['1', '3', '5']
|
||||
>>> set2rel(set()) == set()
|
||||
True
|
||||
>>> set2rel(set2rel(s3)) == set2rel(s3)
|
||||
True
|
||||
|
||||
Predication is evaluated by set membership.
|
||||
|
||||
>>> ('d1', 'd2') in s1
|
||||
True
|
||||
>>> ('d2', 'd2') in s1
|
||||
False
|
||||
>>> ('d1',) in s1
|
||||
False
|
||||
>>> 'd2' in s1
|
||||
False
|
||||
>>> ('d1',) in s4
|
||||
True
|
||||
>>> ('d1',) in set()
|
||||
False
|
||||
>>> 'd1' in null_binary_rel
|
||||
False
|
||||
|
||||
|
||||
>>> val = Valuation([('Fido', 'd1'), ('dog', set(['d1', 'd2'])), ('walk', set())])
|
||||
>>> sorted(val['dog'])
|
||||
[('d1',), ('d2',)]
|
||||
>>> val.domain == set(['d1', 'd2'])
|
||||
True
|
||||
>>> print(val.symbols)
|
||||
['Fido', 'dog', 'walk']
|
||||
|
||||
|
||||
Parse a valuation from a string.
|
||||
|
||||
>>> v = """
|
||||
... john => b1
|
||||
... mary => g1
|
||||
... suzie => g2
|
||||
... fido => d1
|
||||
... tess => d2
|
||||
... noosa => n
|
||||
... girl => {g1, g2}
|
||||
... boy => {b1, b2}
|
||||
... dog => {d1, d2}
|
||||
... bark => {d1, d2}
|
||||
... walk => {b1, g2, d1}
|
||||
... chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)}
|
||||
... see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)}
|
||||
... in => {(b1, n), (b2, n), (d2, n)}
|
||||
... with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)}
|
||||
... """
|
||||
>>> val = Valuation.fromstring(v)
|
||||
|
||||
>>> print(val) # doctest: +SKIP
|
||||
{'bark': set([('d1',), ('d2',)]),
|
||||
'boy': set([('b1',), ('b2',)]),
|
||||
'chase': set([('b1', 'g1'), ('g2', 'd2'), ('g1', 'd1'), ('b2', 'g1')]),
|
||||
'dog': set([('d1',), ('d2',)]),
|
||||
'fido': 'd1',
|
||||
'girl': set([('g2',), ('g1',)]),
|
||||
'in': set([('d2', 'n'), ('b1', 'n'), ('b2', 'n')]),
|
||||
'john': 'b1',
|
||||
'mary': 'g1',
|
||||
'noosa': 'n',
|
||||
'see': set([('b1', 'g1'), ('b2', 'd2'), ('d2', 'b1'), ('g2', 'n'), ('g1', 'b1')]),
|
||||
'suzie': 'g2',
|
||||
'tess': 'd2',
|
||||
'walk': set([('d1',), ('b1',), ('g2',)]),
|
||||
'with': set([('b1', 'g1'), ('d1', 'b1'), ('b1', 'd1'), ('g1', 'b1')])}
|
||||
|
||||
|
||||
Unit tests for function argument application in a Model
|
||||
-------------------------------------------------------
|
||||
|
||||
>>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\
|
||||
... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])),
|
||||
... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])),
|
||||
... ('kiss', null_binary_rel)]
|
||||
>>> val = Valuation(v)
|
||||
>>> dom = val.domain
|
||||
>>> m = Model(dom, val)
|
||||
>>> g = Assignment(dom)
|
||||
>>> sorted(val['boy'])
|
||||
[('b1',), ('b2',)]
|
||||
>>> ('b1',) in val['boy']
|
||||
True
|
||||
>>> ('g1',) in val['boy']
|
||||
False
|
||||
>>> ('foo',) in val['boy']
|
||||
False
|
||||
>>> ('b1', 'g1') in val['love']
|
||||
True
|
||||
>>> ('b1', 'b1') in val['kiss']
|
||||
False
|
||||
>>> sorted(val.domain)
|
||||
['b1', 'b2', 'd1', 'g1', 'g2']
|
||||
|
||||
|
||||
Model Tests
|
||||
===========
|
||||
|
||||
Extension of Lambda expressions
|
||||
|
||||
>>> v0 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\
|
||||
... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])),
|
||||
... ('dog', set(['d1'])),
|
||||
... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
|
||||
|
||||
>>> val0 = Valuation(v0)
|
||||
>>> dom0 = val0.domain
|
||||
>>> m0 = Model(dom0, val0)
|
||||
>>> g0 = Assignment(dom0)
|
||||
|
||||
>>> print(m0.evaluate(r'\x. \y. love(x, y)', g0) == {'g2': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'b2': {'g2': True, 'b2': False, 'b1': False, 'g1': False, 'd1': False}, 'b1': {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}, 'g1': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'd1': {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}})
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. dog(x) (adam)', g0))
|
||||
False
|
||||
>>> print(m0.evaluate(r'\x. (dog(x) | boy(x)) (adam)', g0))
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. \y. love(x, y)(fido)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False})
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. \y. love(x, y)(adam)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False})
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)', g0) == {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False})
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)(adam)', g0))
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. \y. love(x, y)(betty, adam)', g0))
|
||||
True
|
||||
>>> print(m0.evaluate(r'\y. \x. love(x, y)(fido)(adam)', g0))
|
||||
False
|
||||
>>> print(m0.evaluate(r'\y. \x. love(x, y)(betty, adam)', g0))
|
||||
True
|
||||
>>> print(m0.evaluate(r'\x. exists y. love(x, y)', g0) == {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False})
|
||||
True
|
||||
>>> print(m0.evaluate(r'\z. adam', g0) == {'g2': 'b1', 'b2': 'b1', 'b1': 'b1', 'g1': 'b1', 'd1': 'b1'})
|
||||
True
|
||||
>>> print(m0.evaluate(r'\z. love(x, y)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False})
|
||||
True
|
||||
|
||||
|
||||
Propositional Model Test
|
||||
------------------------
|
||||
|
||||
>>> tests = [
|
||||
... ('P & Q', True),
|
||||
... ('P & R', False),
|
||||
... ('- P', False),
|
||||
... ('- R', True),
|
||||
... ('- - P', True),
|
||||
... ('- (P & R)', True),
|
||||
... ('P | R', True),
|
||||
... ('R | P', True),
|
||||
... ('R | R', False),
|
||||
... ('- P | R', False),
|
||||
... ('P | - P', True),
|
||||
... ('P -> Q', True),
|
||||
... ('P -> R', False),
|
||||
... ('R -> P', True),
|
||||
... ('P <-> P', True),
|
||||
... ('R <-> R', True),
|
||||
... ('P <-> R', False),
|
||||
... ]
|
||||
>>> val1 = Valuation([('P', True), ('Q', True), ('R', False)])
|
||||
>>> dom = set([])
|
||||
>>> m = Model(dom, val1)
|
||||
>>> g = Assignment(dom)
|
||||
>>> for (sent, testvalue) in tests:
|
||||
... semvalue = m.evaluate(sent, g)
|
||||
... if semvalue == testvalue:
|
||||
... print('*', end=' ')
|
||||
* * * * * * * * * * * * * * * * *
|
||||
|
||||
|
||||
Test of i Function
|
||||
------------------
|
||||
|
||||
>>> from nltk.sem import Expression
|
||||
>>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),
|
||||
... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])),
|
||||
... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))]
|
||||
>>> val = Valuation(v)
|
||||
>>> dom = val.domain
|
||||
>>> m = Model(dom, val)
|
||||
>>> g = Assignment(dom, [('x', 'b1'), ('y', 'g2')])
|
||||
>>> exprs = ['adam', 'girl', 'love', 'walks', 'x', 'y', 'z']
|
||||
>>> parsed_exprs = [Expression.fromstring(e) for e in exprs]
|
||||
>>> sorted_set = lambda x: sorted(x) if isinstance(x, set) else x
|
||||
>>> for parsed in parsed_exprs:
|
||||
... try:
|
||||
... print("'%s' gets value %s" % (parsed, sorted_set(m.i(parsed, g))))
|
||||
... except Undefined:
|
||||
... print("'%s' is Undefined" % parsed)
|
||||
'adam' gets value b1
|
||||
'girl' gets value [('g1',), ('g2',)]
|
||||
'love' gets value [('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]
|
||||
'walks' is Undefined
|
||||
'x' gets value b1
|
||||
'y' gets value g2
|
||||
'z' is Undefined
|
||||
|
||||
Test for formulas in Model
|
||||
--------------------------
|
||||
|
||||
>>> tests = [
|
||||
... ('love(adam, betty)', True),
|
||||
... ('love(adam, sue)', 'Undefined'),
|
||||
... ('dog(fido)', True),
|
||||
... ('- dog(fido)', False),
|
||||
... ('- - dog(fido)', True),
|
||||
... ('- dog(sue)', 'Undefined'),
|
||||
... ('dog(fido) & boy(adam)', True),
|
||||
... ('- (dog(fido) & boy(adam))', False),
|
||||
... ('- dog(fido) & boy(adam)', False),
|
||||
... ('dog(fido) | boy(adam)', True),
|
||||
... ('- (dog(fido) | boy(adam))', False),
|
||||
... ('- dog(fido) | boy(adam)', True),
|
||||
... ('- dog(fido) | - boy(adam)', False),
|
||||
... ('dog(fido) -> boy(adam)', True),
|
||||
... ('- (dog(fido) -> boy(adam))', False),
|
||||
... ('- dog(fido) -> boy(adam)', True),
|
||||
... ('exists x . love(adam, x)', True),
|
||||
... ('all x . love(adam, x)', False),
|
||||
... ('fido = fido', True),
|
||||
... ('exists x . all y. love(x, y)', False),
|
||||
... ('exists x . (x = fido)', True),
|
||||
... ('all x . (dog(x) | - dog(x))', True),
|
||||
... ('adam = mia', 'Undefined'),
|
||||
... ('\\x. (boy(x) | girl(x))', {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}),
|
||||
... ('\\x. exists y. (boy(x) & love(x, y))', {'g2': False, 'b2': True, 'b1': True, 'g1': False, 'd1': False}),
|
||||
... ('exists z1. boy(z1)', True),
|
||||
... ('exists x. (boy(x) & - (x = adam))', True),
|
||||
... ('exists x. (boy(x) & all y. love(y, x))', False),
|
||||
... ('all x. (boy(x) | girl(x))', False),
|
||||
... ('all x. (girl(x) -> exists y. boy(y) & love(x, y))', False),
|
||||
... ('exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', True),
|
||||
... ('exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', False),
|
||||
... ('all x. (dog(x) -> - girl(x))', True),
|
||||
... ('exists x. exists y. (love(x, y) & love(x, y))', True),
|
||||
... ]
|
||||
>>> for (sent, testvalue) in tests:
|
||||
... semvalue = m.evaluate(sent, g)
|
||||
... if semvalue == testvalue:
|
||||
... print('*', end=' ')
|
||||
... else:
|
||||
... print(sent, semvalue)
|
||||
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
|
||||
|
||||
|
||||
|
||||
Satisfier Tests
|
||||
---------------
|
||||
|
||||
>>> formulas = [
|
||||
... 'boy(x)',
|
||||
... '(x = x)',
|
||||
... '(boy(x) | girl(x))',
|
||||
... '(boy(x) & girl(x))',
|
||||
... 'love(adam, x)',
|
||||
... 'love(x, adam)',
|
||||
... '- (x = adam)',
|
||||
... 'exists z22. love(x, z22)',
|
||||
... 'exists y. love(y, x)',
|
||||
... 'all y. (girl(y) -> love(x, y))',
|
||||
... 'all y. (girl(y) -> love(y, x))',
|
||||
... 'all y. (girl(y) -> (boy(x) & love(y, x)))',
|
||||
... 'boy(x) & all y. (girl(y) -> love(x, y))',
|
||||
... 'boy(x) & all y. (girl(y) -> love(y, x))',
|
||||
... 'boy(x) & exists y. (girl(y) & love(y, x))',
|
||||
... 'girl(x) -> dog(x)',
|
||||
... 'all y. (dog(y) -> (x = y))',
|
||||
... '- exists y. love(y, x)',
|
||||
... 'exists y. (love(adam, y) & love(y, x))'
|
||||
... ]
|
||||
>>> g.purge()
|
||||
>>> g.add('x', 'b1')
|
||||
{'x': 'b1'}
|
||||
>>> for f in formulas:
|
||||
... try:
|
||||
... print("'%s' gets value: %s" % (f, m.evaluate(f, g)))
|
||||
... except Undefined:
|
||||
... print("'%s' is Undefined" % f)
|
||||
'boy(x)' gets value: True
|
||||
'(x = x)' gets value: True
|
||||
'(boy(x) | girl(x))' gets value: True
|
||||
'(boy(x) & girl(x))' gets value: False
|
||||
'love(adam, x)' gets value: False
|
||||
'love(x, adam)' gets value: False
|
||||
'- (x = adam)' gets value: False
|
||||
'exists z22. love(x, z22)' gets value: True
|
||||
'exists y. love(y, x)' gets value: True
|
||||
'all y. (girl(y) -> love(x, y))' gets value: False
|
||||
'all y. (girl(y) -> love(y, x))' gets value: True
|
||||
'all y. (girl(y) -> (boy(x) & love(y, x)))' gets value: True
|
||||
'boy(x) & all y. (girl(y) -> love(x, y))' gets value: False
|
||||
'boy(x) & all y. (girl(y) -> love(y, x))' gets value: True
|
||||
'boy(x) & exists y. (girl(y) & love(y, x))' gets value: True
|
||||
'girl(x) -> dog(x)' gets value: True
|
||||
'all y. (dog(y) -> (x = y))' gets value: False
|
||||
'- exists y. love(y, x)' gets value: False
|
||||
'exists y. (love(adam, y) & love(y, x))' gets value: True
|
||||
|
||||
>>> from nltk.sem import Expression
|
||||
>>> for fmla in formulas:
|
||||
... p = Expression.fromstring(fmla)
|
||||
... g.purge()
|
||||
... print("Satisfiers of '%s':\n\t%s" % (p, sorted(m.satisfiers(p, 'x', g))))
|
||||
Satisfiers of 'boy(x)':
|
||||
['b1', 'b2']
|
||||
Satisfiers of '(x = x)':
|
||||
['b1', 'b2', 'd1', 'g1', 'g2']
|
||||
Satisfiers of '(boy(x) | girl(x))':
|
||||
['b1', 'b2', 'g1', 'g2']
|
||||
Satisfiers of '(boy(x) & girl(x))':
|
||||
[]
|
||||
Satisfiers of 'love(adam,x)':
|
||||
['g1']
|
||||
Satisfiers of 'love(x,adam)':
|
||||
['g1', 'g2']
|
||||
Satisfiers of '-(x = adam)':
|
||||
['b2', 'd1', 'g1', 'g2']
|
||||
Satisfiers of 'exists z22.love(x,z22)':
|
||||
['b1', 'b2', 'g1', 'g2']
|
||||
Satisfiers of 'exists y.love(y,x)':
|
||||
['b1', 'g1', 'g2']
|
||||
Satisfiers of 'all y.(girl(y) -> love(x,y))':
|
||||
[]
|
||||
Satisfiers of 'all y.(girl(y) -> love(y,x))':
|
||||
['b1']
|
||||
Satisfiers of 'all y.(girl(y) -> (boy(x) & love(y,x)))':
|
||||
['b1']
|
||||
Satisfiers of '(boy(x) & all y.(girl(y) -> love(x,y)))':
|
||||
[]
|
||||
Satisfiers of '(boy(x) & all y.(girl(y) -> love(y,x)))':
|
||||
['b1']
|
||||
Satisfiers of '(boy(x) & exists y.(girl(y) & love(y,x)))':
|
||||
['b1']
|
||||
Satisfiers of '(girl(x) -> dog(x))':
|
||||
['b1', 'b2', 'd1']
|
||||
Satisfiers of 'all y.(dog(y) -> (x = y))':
|
||||
['d1']
|
||||
Satisfiers of '-exists y.love(y,x)':
|
||||
['b2', 'd1']
|
||||
Satisfiers of 'exists y.(love(adam,y) & love(y,x))':
|
||||
['b1']
|
||||
|
||||
|
||||
Tests based on the Blackburn & Bos testsuite
|
||||
--------------------------------------------
|
||||
|
||||
>>> v1 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'),
|
||||
... ('honey_bunny', 'd4'), ('yolanda', 'd5'),
|
||||
... ('customer', set(['d1', 'd2'])),
|
||||
... ('robber', set(['d3', 'd4'])),
|
||||
... ('love', set([('d3', 'd4')]))]
|
||||
>>> val1 = Valuation(v1)
|
||||
>>> dom1 = val1.domain
|
||||
>>> m1 = Model(dom1, val1)
|
||||
>>> g1 = Assignment(dom1)
|
||||
|
||||
>>> v2 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'),
|
||||
... ('honey_bunny', 'd4'), ('yolanda', 'd4'),
|
||||
... ('customer', set(['d1', 'd2', 'd5', 'd6'])),
|
||||
... ('robber', set(['d3', 'd4'])),
|
||||
... ('love', set([(None, None)]))]
|
||||
>>> val2 = Valuation(v2)
|
||||
>>> dom2 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6'])
|
||||
>>> m2 = Model(dom2, val2)
|
||||
>>> g2 = Assignment(dom2)
|
||||
>>> g21 = Assignment(dom2)
|
||||
>>> g21.add('y', 'd3')
|
||||
{'y': 'd3'}
|
||||
|
||||
>>> v3 = [('mia', 'd1'), ('jody', 'd2'), ('jules', 'd3'),
|
||||
... ('vincent', 'd4'),
|
||||
... ('woman', set(['d1', 'd2'])), ('man', set(['d3', 'd4'])),
|
||||
... ('joke', set(['d5', 'd6'])), ('episode', set(['d7', 'd8'])),
|
||||
... ('in', set([('d5', 'd7'), ('d5', 'd8')])),
|
||||
... ('tell', set([('d1', 'd5'), ('d2', 'd6')]))]
|
||||
>>> val3 = Valuation(v3)
|
||||
>>> dom3 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8'])
|
||||
>>> m3 = Model(dom3, val3)
|
||||
>>> g3 = Assignment(dom3)
|
||||
|
||||
>>> tests = [
|
||||
... ('exists x. robber(x)', m1, g1, True),
|
||||
... ('exists x. exists y. love(y, x)', m1, g1, True),
|
||||
... ('exists x0. exists x1. love(x1, x0)', m2, g2, False),
|
||||
... ('all x. all y. love(y, x)', m2, g2, False),
|
||||
... ('- (all x. all y. love(y, x))', m2, g2, True),
|
||||
... ('all x. all y. - love(y, x)', m2, g2, True),
|
||||
... ('yolanda = honey_bunny', m2, g2, True),
|
||||
... ('mia = honey_bunny', m2, g2, 'Undefined'),
|
||||
... ('- (yolanda = honey_bunny)', m2, g2, False),
|
||||
... ('- (mia = honey_bunny)', m2, g2, 'Undefined'),
|
||||
... ('all x. (robber(x) | customer(x))', m2, g2, True),
|
||||
... ('- (all x. (robber(x) | customer(x)))', m2, g2, False),
|
||||
... ('(robber(x) | customer(x))', m2, g2, 'Undefined'),
|
||||
... ('(robber(y) | customer(y))', m2, g21, True),
|
||||
... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True),
|
||||
... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True),
|
||||
... ('- exists x. woman(x)', m3, g3, False),
|
||||
... ('exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'),
|
||||
... ('- exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'),
|
||||
... ('exists x. (man(x) & - exists y. woman(y))', m3, g3, False),
|
||||
... ('exists x. (man(x) & - exists x. woman(x))', m3, g3, False),
|
||||
... ('exists x. (woman(x) & - exists x. customer(x))', m2, g2, 'Undefined'),
|
||||
... ]
|
||||
|
||||
>>> for item in tests:
|
||||
... sentence, model, g, testvalue = item
|
||||
... semvalue = model.evaluate(sentence, g)
|
||||
... if semvalue == testvalue:
|
||||
... print('*', end=' ')
|
||||
... g.purge()
|
||||
* * * * * * * * * * * * * * * * * * * * * *
|
||||
|
||||
|
||||
Tests for mapping from syntax to semantics
|
||||
------------------------------------------
|
||||
|
||||
Load a valuation from a file.
|
||||
|
||||
>>> import nltk.data
|
||||
>>> from nltk.sem.util import parse_sents
|
||||
>>> val = nltk.data.load('grammars/sample_grammars/valuation1.val')
|
||||
>>> dom = val.domain
|
||||
>>> m = Model(dom, val)
|
||||
>>> g = Assignment(dom)
|
||||
>>> gramfile = 'grammars/sample_grammars/sem2.fcfg'
|
||||
>>> inputs = ['John sees a girl', 'every dog barks']
|
||||
>>> parses = parse_sents(inputs, gramfile)
|
||||
>>> for sent, trees in zip(inputs, parses):
|
||||
... print()
|
||||
... print("Sentence: %s" % sent)
|
||||
... for tree in trees:
|
||||
... print("Parse:\n %s" %tree)
|
||||
... print("Semantics: %s" % root_semrep(tree))
|
||||
<BLANKLINE>
|
||||
Sentence: John sees a girl
|
||||
Parse:
|
||||
(S[SEM=<exists x.(girl(x) & see(john,x))>]
|
||||
(NP[-LOC, NUM='sg', SEM=<\P.P(john)>]
|
||||
(PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John))
|
||||
(VP[NUM='sg', SEM=<\y.exists x.(girl(x) & see(y,x))>]
|
||||
(TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees)
|
||||
(NP[NUM='sg', SEM=<\Q.exists x.(girl(x) & Q(x))>]
|
||||
(Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a)
|
||||
(Nom[NUM='sg', SEM=<\x.girl(x)>]
|
||||
(N[NUM='sg', SEM=<\x.girl(x)>] girl)))))
|
||||
Semantics: exists x.(girl(x) & see(john,x))
|
||||
<BLANKLINE>
|
||||
Sentence: every dog barks
|
||||
Parse:
|
||||
(S[SEM=<all x.(dog(x) -> bark(x))>]
|
||||
(NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>]
|
||||
(Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every)
|
||||
(Nom[NUM='sg', SEM=<\x.dog(x)>]
|
||||
(N[NUM='sg', SEM=<\x.dog(x)>] dog)))
|
||||
(VP[NUM='sg', SEM=<\x.bark(x)>]
|
||||
(IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks)))
|
||||
Semantics: all x.(dog(x) -> bark(x))
|
||||
|
||||
>>> sent = "every dog barks"
|
||||
>>> result = nltk.sem.util.interpret_sents([sent], gramfile)[0]
|
||||
>>> for (syntree, semrep) in result:
|
||||
... print(syntree)
|
||||
... print()
|
||||
... print(semrep)
|
||||
(S[SEM=<all x.(dog(x) -> bark(x))>]
|
||||
(NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>]
|
||||
(Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every)
|
||||
(Nom[NUM='sg', SEM=<\x.dog(x)>]
|
||||
(N[NUM='sg', SEM=<\x.dog(x)>] dog)))
|
||||
(VP[NUM='sg', SEM=<\x.bark(x)>]
|
||||
(IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks)))
|
||||
<BLANKLINE>
|
||||
all x.(dog(x) -> bark(x))
|
||||
|
||||
>>> result = nltk.sem.util.evaluate_sents([sent], gramfile, m, g)[0]
|
||||
>>> for (syntree, semrel, value) in result:
|
||||
... print(syntree)
|
||||
... print()
|
||||
... print(semrep)
|
||||
... print()
|
||||
... print(value)
|
||||
(S[SEM=<all x.(dog(x) -> bark(x))>]
|
||||
(NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>]
|
||||
(Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every)
|
||||
(Nom[NUM='sg', SEM=<\x.dog(x)>]
|
||||
(N[NUM='sg', SEM=<\x.dog(x)>] dog)))
|
||||
(VP[NUM='sg', SEM=<\x.bark(x)>]
|
||||
(IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks)))
|
||||
<BLANKLINE>
|
||||
all x.(dog(x) -> bark(x))
|
||||
<BLANKLINE>
|
||||
True
|
||||
|
||||
>>> sents = ['Mary walks', 'John sees a dog']
|
||||
>>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg')
|
||||
>>> for result in results:
|
||||
... for (synrep, semrep) in result:
|
||||
... print(synrep)
|
||||
(S[SEM=<walk(mary)>]
|
||||
(NP[-LOC, NUM='sg', SEM=<\P.P(mary)>]
|
||||
(PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary))
|
||||
(VP[NUM='sg', SEM=<\x.walk(x)>]
|
||||
(IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks)))
|
||||
(S[SEM=<exists x.(dog(x) & see(john,x))>]
|
||||
(NP[-LOC, NUM='sg', SEM=<\P.P(john)>]
|
||||
(PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John))
|
||||
(VP[NUM='sg', SEM=<\y.exists x.(dog(x) & see(y,x))>]
|
||||
(TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees)
|
||||
(NP[NUM='sg', SEM=<\Q.exists x.(dog(x) & Q(x))>]
|
||||
(Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a)
|
||||
(Nom[NUM='sg', SEM=<\x.dog(x)>]
|
||||
(N[NUM='sg', SEM=<\x.dog(x)>] dog)))))
|
||||
|
||||
Cooper Storage
|
||||
--------------
|
||||
|
||||
>>> from nltk.sem import cooper_storage as cs
|
||||
>>> sentence = 'every girl chases a dog'
|
||||
>>> trees = cs.parse_with_bindops(sentence, grammar='grammars/book_grammars/storage.fcfg')
|
||||
>>> semrep = trees[0].label()['SEM']
|
||||
>>> cs_semrep = cs.CooperStore(semrep)
|
||||
>>> print(cs_semrep.core)
|
||||
chase(z2,z4)
|
||||
>>> for bo in cs_semrep.store:
|
||||
... print(bo)
|
||||
bo(\P.all x.(girl(x) -> P(x)),z2)
|
||||
bo(\P.exists x.(dog(x) & P(x)),z4)
|
||||
>>> cs_semrep.s_retrieve(trace=True)
|
||||
Permutation 1
|
||||
(\P.all x.(girl(x) -> P(x)))(\z2.chase(z2,z4))
|
||||
(\P.exists x.(dog(x) & P(x)))(\z4.all x.(girl(x) -> chase(x,z4)))
|
||||
Permutation 2
|
||||
(\P.exists x.(dog(x) & P(x)))(\z4.chase(z2,z4))
|
||||
(\P.all x.(girl(x) -> P(x)))(\z2.exists x.(dog(x) & chase(z2,x)))
|
||||
|
||||
>>> for reading in cs_semrep.readings:
|
||||
... print(reading)
|
||||
exists x.(dog(x) & all z3.(girl(z3) -> chase(z3,x)))
|
||||
all x.(girl(x) -> exists z4.(dog(z4) & chase(x,z4)))
|
||||
@@ -0,0 +1,236 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===================
|
||||
Sentiment Analysis
|
||||
===================
|
||||
|
||||
>>> from nltk.classify import NaiveBayesClassifier
|
||||
>>> from nltk.corpus import subjectivity
|
||||
>>> from nltk.sentiment import SentimentAnalyzer
|
||||
>>> from nltk.sentiment.util import *
|
||||
|
||||
>>> n_instances = 100
|
||||
>>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
|
||||
>>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
|
||||
>>> len(subj_docs), len(obj_docs)
|
||||
(100, 100)
|
||||
|
||||
Each document is represented by a tuple (sentence, label). The sentence is tokenized,
|
||||
so it is represented by a list of strings:
|
||||
|
||||
>>> subj_docs[0]
|
||||
(['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one',
|
||||
'thing', 'is', 'a', 'small', 'gem', '.'], 'subj')
|
||||
|
||||
We separately split subjective and objective instances to keep a balanced uniform
|
||||
class distribution in both train and test sets.
|
||||
|
||||
>>> train_subj_docs = subj_docs[:80]
|
||||
>>> test_subj_docs = subj_docs[80:100]
|
||||
>>> train_obj_docs = obj_docs[:80]
|
||||
>>> test_obj_docs = obj_docs[80:100]
|
||||
>>> training_docs = train_subj_docs+train_obj_docs
|
||||
>>> testing_docs = test_subj_docs+test_obj_docs
|
||||
|
||||
>>> sentim_analyzer = SentimentAnalyzer()
|
||||
>>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
|
||||
|
||||
We use simple unigram word features, handling negation:
|
||||
|
||||
>>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
|
||||
>>> len(unigram_feats)
|
||||
83
|
||||
>>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
|
||||
|
||||
We apply features to obtain a feature-value representation of our datasets:
|
||||
|
||||
>>> training_set = sentim_analyzer.apply_features(training_docs)
|
||||
>>> test_set = sentim_analyzer.apply_features(testing_docs)
|
||||
|
||||
We can now train our classifier on the training set, and subsequently output the
|
||||
evaluation results:
|
||||
|
||||
>>> trainer = NaiveBayesClassifier.train
|
||||
>>> classifier = sentim_analyzer.train(trainer, training_set)
|
||||
Training classifier
|
||||
>>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
|
||||
... print('{0}: {1}'.format(key, value))
|
||||
Evaluating NaiveBayesClassifier results...
|
||||
Accuracy: 0.8
|
||||
F-measure [obj]: 0.8
|
||||
F-measure [subj]: 0.8
|
||||
Precision [obj]: 0.8
|
||||
Precision [subj]: 0.8
|
||||
Recall [obj]: 0.8
|
||||
Recall [subj]: 0.8
|
||||
|
||||
|
||||
Vader
|
||||
------
|
||||
|
||||
>>> from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||
>>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example
|
||||
... "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
|
||||
... "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted)
|
||||
... "VADER is VERY SMART, handsome, and FUNNY.", # emphasis for ALLCAPS handled
|
||||
... "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity
|
||||
... "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score
|
||||
... "The book was good.", # positive sentence
|
||||
... "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted)
|
||||
... "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence
|
||||
... "A really bad, horrible book.", # negative sentence with booster words
|
||||
... "At least it isn't a horrible book.", # negated negative sentence with contraction
|
||||
... ":) and :D", # emoticons handled
|
||||
... "", # an empty string is correctly handled
|
||||
... "Today sux", # negative slang handled
|
||||
... "Today sux!", # negative slang with punctuation emphasis handled
|
||||
... "Today SUX!", # negative slang with capitalization emphasis
|
||||
... "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and contrastive conjunction "but"
|
||||
... ]
|
||||
>>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \
|
||||
... Unbelievably bad acting!! Poor direction. VERY poor production. \
|
||||
... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"
|
||||
|
||||
>>> from nltk import tokenize
|
||||
>>> lines_list = tokenize.sent_tokenize(paragraph)
|
||||
>>> sentences.extend(lines_list)
|
||||
|
||||
>>> tricky_sentences = [
|
||||
... "Most automated sentiment analysis tools are shit.",
|
||||
... "VADER sentiment analysis is the shit.",
|
||||
... "Sentiment analysis has never been good.",
|
||||
... "Sentiment analysis with VADER has never been this good.",
|
||||
... "Warren Beatty has never been so entertaining.",
|
||||
... "I won't say that the movie is astounding and I wouldn't claim that \
|
||||
... the movie is too banal either.",
|
||||
... "I like to hate Michael Bay films, but I couldn't fault this one",
|
||||
... "I like to hate Michael Bay films, BUT I couldn't help but fault this one",
|
||||
... "It's one thing to watch an Uwe Boll film, but another thing entirely \
|
||||
... to pay for it",
|
||||
... "The movie was too good",
|
||||
... "This movie was actually neither that funny, nor super witty.",
|
||||
... "This movie doesn't care about cleverness, wit or any other kind of \
|
||||
... intelligent humor.",
|
||||
... "Those who find ugly meanings in beautiful things are corrupt without \
|
||||
... being charming.",
|
||||
... "There are slow and repetitive parts, BUT it has just enough spice to \
|
||||
... keep it interesting.",
|
||||
... "The script is not fantastic, but the acting is decent and the cinematography \
|
||||
... is EXCELLENT!",
|
||||
... "Roger Dodger is one of the most compelling variations on this theme.",
|
||||
... "Roger Dodger is one of the least compelling variations on this theme.",
|
||||
... "Roger Dodger is at least compelling as a variation on the theme.",
|
||||
... "they fall in love with the product",
|
||||
... "but then it breaks",
|
||||
... "usually around the time the 90 day warranty expires",
|
||||
... "the twin towers collapsed today",
|
||||
... "However, Mr. Carter solemnly argues, his client carried out the kidnapping \
|
||||
... under orders and in the ''least offensive way possible.''"
|
||||
... ]
|
||||
>>> sentences.extend(tricky_sentences)
|
||||
>>> for sentence in sentences:
|
||||
... sid = SentimentIntensityAnalyzer()
|
||||
... print(sentence)
|
||||
... ss = sid.polarity_scores(sentence)
|
||||
... for k in sorted(ss):
|
||||
... print('{0}: {1}, '.format(k, ss[k]), end='')
|
||||
... print()
|
||||
VADER is smart, handsome, and funny.
|
||||
compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746,
|
||||
VADER is smart, handsome, and funny!
|
||||
compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752,
|
||||
VADER is very smart, handsome, and funny.
|
||||
compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701,
|
||||
VADER is VERY SMART, handsome, and FUNNY.
|
||||
compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754,
|
||||
VADER is VERY SMART, handsome, and FUNNY!!!
|
||||
compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767,
|
||||
VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!
|
||||
compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706,
|
||||
The book was good.
|
||||
compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492,
|
||||
The book was kind of good.
|
||||
compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343,
|
||||
The plot was good, but the characters are uncompelling and the dialog is not great.
|
||||
compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094,
|
||||
A really bad, horrible book.
|
||||
compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0,
|
||||
At least it isn't a horrible book.
|
||||
compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363,
|
||||
:) and :D
|
||||
compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876,
|
||||
<BLANKLINE>
|
||||
compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0,
|
||||
Today sux
|
||||
compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0,
|
||||
Today sux!
|
||||
compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0,
|
||||
Today SUX!
|
||||
compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0,
|
||||
Today kinda sux! But I'll get by, lol
|
||||
compound: 0.5249, neg: 0.138, neu: 0.517, pos: 0.344,
|
||||
It was one of the worst movies I've seen, despite good reviews.
|
||||
compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0,
|
||||
Unbelievably bad acting!!
|
||||
compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0,
|
||||
Poor direction.
|
||||
compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0,
|
||||
VERY poor production.
|
||||
compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0,
|
||||
The movie was bad.
|
||||
compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0,
|
||||
Very bad movie.
|
||||
compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0,
|
||||
VERY bad movie.
|
||||
compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0,
|
||||
VERY BAD movie.
|
||||
compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0,
|
||||
VERY BAD movie!
|
||||
compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0,
|
||||
Most automated sentiment analysis tools are shit.
|
||||
compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0,
|
||||
VADER sentiment analysis is the shit.
|
||||
compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444,
|
||||
Sentiment analysis has never been good.
|
||||
compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0,
|
||||
Sentiment analysis with VADER has never been this good.
|
||||
compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297,
|
||||
Warren Beatty has never been so entertaining.
|
||||
compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384,
|
||||
I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either.
|
||||
compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149,
|
||||
I like to hate Michael Bay films, but I couldn't fault this one
|
||||
compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309,
|
||||
I like to hate Michael Bay films, BUT I couldn't help but fault this one
|
||||
compound: -0.1531, neg: 0.277, neu: 0.477, pos: 0.246,
|
||||
It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it
|
||||
compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0,
|
||||
The movie was too good
|
||||
compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42,
|
||||
This movie was actually neither that funny, nor super witty.
|
||||
compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0,
|
||||
This movie doesn't care about cleverness, wit or any other kind of intelligent humor.
|
||||
compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239,
|
||||
Those who find ugly meanings in beautiful things are corrupt without being charming.
|
||||
compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192,
|
||||
There are slow and repetitive parts, BUT it has just enough spice to keep it interesting.
|
||||
compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186,
|
||||
The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT!
|
||||
compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301,
|
||||
Roger Dodger is one of the most compelling variations on this theme.
|
||||
compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166,
|
||||
Roger Dodger is one of the least compelling variations on this theme.
|
||||
compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0,
|
||||
Roger Dodger is at least compelling as a variation on the theme.
|
||||
compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16,
|
||||
they fall in love with the product
|
||||
compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412,
|
||||
but then it breaks
|
||||
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0,
|
||||
usually around the time the 90 day warranty expires
|
||||
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0,
|
||||
the twin towers collapsed today
|
||||
compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0,
|
||||
However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.''
|
||||
compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074,
|
||||
@@ -0,0 +1,41 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
======================
|
||||
SentiWordNet Interface
|
||||
======================
|
||||
|
||||
SentiWordNet can be imported like this:
|
||||
|
||||
>>> from nltk.corpus import sentiwordnet as swn
|
||||
|
||||
------------
|
||||
SentiSynsets
|
||||
------------
|
||||
|
||||
>>> breakdown = swn.senti_synset('breakdown.n.03')
|
||||
>>> print(breakdown)
|
||||
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
|
||||
>>> breakdown.pos_score()
|
||||
0.0
|
||||
>>> breakdown.neg_score()
|
||||
0.25
|
||||
>>> breakdown.obj_score()
|
||||
0.75
|
||||
|
||||
|
||||
------
|
||||
Lookup
|
||||
------
|
||||
|
||||
>>> list(swn.senti_synsets('slow'))
|
||||
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),
|
||||
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),
|
||||
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),
|
||||
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),
|
||||
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),
|
||||
SentiSynset('behind.r.03')]
|
||||
|
||||
>>> happy = swn.senti_synsets('happy', 'a')
|
||||
|
||||
>>> all = swn.all_senti_synsets()
|
||||
@@ -0,0 +1,26 @@
|
||||
from nltk.internals import find_binary, find_jar
|
||||
|
||||
|
||||
def check_binary(binary: str, **args):
    """Skip the current test via `pytest.skip` if the `binary` executable is not found.

    :param binary: Name of the executable to look up.
    :param args: Keyword arguments passed through unchanged to
        `nltk.internals.find_binary`.
    """
    # pytest is imported only when the helper is actually called.
    import pytest

    try:
        find_binary(binary, **args)
    except LookupError:
        # The lookup failed, so mark the calling test as skipped
        # instead of letting the LookupError surface as a test error.
        pytest.skip(f"Skipping test because the {binary} binary was not found.")
|
||||
|
||||
|
||||
def check_jar(name_pattern: str, **args):
    """Skip a test via `pytest.skip`; intended for tests that need the `name_pattern` jar.

    The signature mirrors a lookup through `nltk.internals.find_jar`, but
    no lookup is performed at the moment: both `name_pattern` and the
    keyword arguments are ignored and the calling test is skipped
    unconditionally.

    TODO: Investigate why the CoreNLP tests that rely on this check_jar failed
    on the CI. https://github.com/nltk/nltk/pull/3060#issuecomment-1268355108

    :param name_pattern: Name pattern of the jar (currently unused).
    :param args: Keyword arguments meant for `nltk.internals.find_jar`
        (currently unused).
    """
    # pytest is imported only when the helper is actually called.
    import pytest

    pytest.skip(
        "Skipping test because the doctests requiring jars are inconsistent on the CI."
    )
|
||||
@@ -0,0 +1,83 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=================
|
||||
EasyInstall Tests
|
||||
=================
|
||||
|
||||
This file contains some simple tests that will be run by EasyInstall in
|
||||
order to test the installation when NLTK-Data is absent.
|
||||
|
||||
|
||||
------------
|
||||
Tokenization
|
||||
------------
|
||||
|
||||
>>> from nltk.tokenize import wordpunct_tokenize
|
||||
>>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
|
||||
... "two of them.\n\nThanks.")
|
||||
>>> wordpunct_tokenize(s)
|
||||
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
|
||||
'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
|
||||
|
||||
-------
|
||||
Metrics
|
||||
-------
|
||||
|
||||
>>> from nltk.metrics import precision, recall, f_measure
|
||||
>>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split()
|
||||
>>> test = 'DET VB VB DET NN NN NN IN DET NN'.split()
|
||||
>>> reference_set = set(reference)
|
||||
>>> test_set = set(test)
|
||||
>>> precision(reference_set, test_set)
|
||||
1.0
|
||||
>>> print(recall(reference_set, test_set))
|
||||
0.8
|
||||
>>> print(f_measure(reference_set, test_set))
|
||||
0.88888888888...
|
||||
|
||||
------------------
|
||||
Feature Structures
|
||||
------------------
|
||||
|
||||
>>> from nltk import FeatStruct
|
||||
>>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem')
|
||||
>>> fs2 = FeatStruct(POS='N', AGR=fs1)
|
||||
>>> print(fs2)
|
||||
[ [ GND = 'fem' ] ]
|
||||
[ AGR = [ NUM = 'pl' ] ]
|
||||
[ [ PER = 3 ] ]
|
||||
[ ]
|
||||
[ POS = 'N' ]
|
||||
>>> print(fs2['AGR'])
|
||||
[ GND = 'fem' ]
|
||||
[ NUM = 'pl' ]
|
||||
[ PER = 3 ]
|
||||
>>> print(fs2['AGR']['PER'])
|
||||
3
|
||||
|
||||
-------
|
||||
Parsing
|
||||
-------
|
||||
|
||||
>>> from nltk.parse.recursivedescent import RecursiveDescentParser
|
||||
>>> from nltk.grammar import CFG
|
||||
>>> grammar = CFG.fromstring("""
|
||||
... S -> NP VP
|
||||
... PP -> P NP
|
||||
... NP -> 'the' N | N PP | 'the' N PP
|
||||
... VP -> V NP | V PP | V NP PP
|
||||
... N -> 'cat' | 'dog' | 'rug'
|
||||
... V -> 'chased'
|
||||
... P -> 'on'
|
||||
... """)
|
||||
>>> rd = RecursiveDescentParser(grammar)
|
||||
>>> sent = 'the cat chased the dog on the rug'.split()
|
||||
>>> for t in rd.parse(sent):
|
||||
... print(t)
|
||||
(S
|
||||
(NP the (N cat))
|
||||
(VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug))))))
|
||||
(S
|
||||
(NP the (N cat))
|
||||
(VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug)))))
|
||||
105
Backend/venv/lib/python3.12/site-packages/nltk/test/stem.doctest
Normal file
105
Backend/venv/lib/python3.12/site-packages/nltk/test/stem.doctest
Normal file
@@ -0,0 +1,105 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
==========
|
||||
Stemmers
|
||||
==========
|
||||
|
||||
Overview
|
||||
~~~~~~~~
|
||||
|
||||
Stemmers remove morphological affixes from words, leaving only the
|
||||
word stem.
|
||||
|
||||
>>> from nltk.stem import *
|
||||
|
||||
Unit tests for the Porter stemmer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
>>> from nltk.stem.porter import *
|
||||
|
||||
Create a new Porter stemmer.
|
||||
|
||||
>>> stemmer = PorterStemmer()
|
||||
|
||||
Test the stemmer on various pluralised words.
|
||||
|
||||
>>> plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
|
||||
... 'died', 'agreed', 'owned', 'humbled', 'sized',
|
||||
... 'meeting', 'stating', 'siezing', 'itemization',
|
||||
... 'sensational', 'traditional', 'reference', 'colonizer',
|
||||
... 'plotted']
|
||||
|
||||
>>> singles = [stemmer.stem(plural) for plural in plurals]
|
||||
|
||||
>>> print(' '.join(singles))
|
||||
caress fli die mule deni die agre own humbl size meet
|
||||
state siez item sensat tradit refer colon plot
|
||||
|
||||
|
||||
Unit tests for Snowball stemmer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
>>> from nltk.stem.snowball import SnowballStemmer
|
||||
|
||||
See which languages are supported.
|
||||
|
||||
>>> print(" ".join(SnowballStemmer.languages))
|
||||
arabic danish dutch english finnish french german hungarian italian
|
||||
norwegian porter portuguese romanian russian spanish swedish
|
||||
|
||||
Create a new instance of a language specific subclass.
|
||||
|
||||
>>> stemmer = SnowballStemmer("english")
|
||||
|
||||
Stem a word.
|
||||
|
||||
>>> print(stemmer.stem("running"))
|
||||
run
|
||||
|
||||
Decide not to stem stopwords.
|
||||
|
||||
>>> stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
|
||||
>>> print(stemmer.stem("having"))
|
||||
have
|
||||
>>> print(stemmer2.stem("having"))
|
||||
having
|
||||
|
||||
The 'english' stemmer is better than the original 'porter' stemmer.
|
||||
|
||||
>>> print(SnowballStemmer("english").stem("generously"))
|
||||
generous
|
||||
>>> print(SnowballStemmer("porter").stem("generously"))
|
||||
gener
|
||||
|
||||
.. note::
|
||||
|
||||
Extra stemmer tests can be found in `nltk.test.unit.test_stem`.
|
||||
|
||||
Unit tests for ARLSTem Stemmer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
>>> from nltk.stem.arlstem import ARLSTem
|
||||
|
||||
Create a Stemmer instance.
|
||||
|
||||
>>> stemmer = ARLSTem()
|
||||
|
||||
Stem a word.
|
||||
|
||||
>>> stemmer.stem('يعمل')
|
||||
'عمل'
|
||||
|
||||
Unit tests for ARLSTem2 Stemmer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
>>> from nltk.stem.arlstem2 import ARLSTem2
|
||||
|
||||
Create a Stemmer instance.
|
||||
|
||||
>>> stemmer = ARLSTem2()
|
||||
|
||||
Stem a word.
|
||||
|
||||
>>> stemmer.stem('يعمل')
|
||||
'عمل'
|
||||
472
Backend/venv/lib/python3.12/site-packages/nltk/test/tag.doctest
Normal file
472
Backend/venv/lib/python3.12/site-packages/nltk/test/tag.doctest
Normal file
@@ -0,0 +1,472 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
Evaluation of Taggers
|
||||
=====================
|
||||
|
||||
Evaluating the standard NLTK PerceptronTagger using Accuracy,
|
||||
Precision, Recall and F-measure for each of the tags.
|
||||
|
||||
>>> from nltk.tag import PerceptronTagger
|
||||
>>> from nltk.corpus import treebank
|
||||
>>> tagger = PerceptronTagger()
|
||||
>>> gold_data = treebank.tagged_sents()[10:20]
|
||||
>>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS
|
||||
0.885931...
|
||||
|
||||
>>> print(tagger.evaluate_per_tag(gold_data))
|
||||
Tag | Prec. | Recall | F-measure
|
||||
-------+--------+--------+-----------
|
||||
'' | 1.0000 | 1.0000 | 1.0000
|
||||
, | 1.0000 | 1.0000 | 1.0000
|
||||
-NONE- | 0.0000 | 0.0000 | 0.0000
|
||||
. | 1.0000 | 1.0000 | 1.0000
|
||||
: | 1.0000 | 1.0000 | 1.0000
|
||||
CC | 1.0000 | 1.0000 | 1.0000
|
||||
CD | 0.7647 | 1.0000 | 0.8667
|
||||
DT | 1.0000 | 1.0000 | 1.0000
|
||||
IN | 1.0000 | 1.0000 | 1.0000
|
||||
JJ | 0.5882 | 0.8333 | 0.6897
|
||||
JJR | 1.0000 | 1.0000 | 1.0000
|
||||
JJS | 1.0000 | 1.0000 | 1.0000
|
||||
NN | 0.7647 | 0.9630 | 0.8525
|
||||
NNP | 0.8929 | 1.0000 | 0.9434
|
||||
NNS | 1.0000 | 1.0000 | 1.0000
|
||||
POS | 1.0000 | 1.0000 | 1.0000
|
||||
PRP | 1.0000 | 1.0000 | 1.0000
|
||||
RB | 0.8000 | 1.0000 | 0.8889
|
||||
RBR | 0.0000 | 0.0000 | 0.0000
|
||||
TO | 1.0000 | 1.0000 | 1.0000
|
||||
VB | 1.0000 | 1.0000 | 1.0000
|
||||
VBD | 0.8571 | 0.9231 | 0.8889
|
||||
VBG | 1.0000 | 1.0000 | 1.0000
|
||||
VBN | 0.8333 | 0.5556 | 0.6667
|
||||
VBP | 0.5714 | 0.8000 | 0.6667
|
||||
VBZ | 1.0000 | 1.0000 | 1.0000
|
||||
WP | 1.0000 | 1.0000 | 1.0000
|
||||
`` | 1.0000 | 1.0000 | 1.0000
|
||||
<BLANKLINE>
|
||||
|
||||
List only the 10 most common tags:
|
||||
|
||||
>>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True))
|
||||
Tag | Prec. | Recall | F-measure
|
||||
-------+--------+--------+-----------
|
||||
IN | 1.0000 | 1.0000 | 1.0000
|
||||
DT | 1.0000 | 1.0000 | 1.0000
|
||||
NN | 0.7647 | 0.9630 | 0.8525
|
||||
NNP | 0.8929 | 1.0000 | 0.9434
|
||||
NNS | 1.0000 | 1.0000 | 1.0000
|
||||
-NONE- | 0.0000 | 0.0000 | 0.0000
|
||||
CD | 0.7647 | 1.0000 | 0.8667
|
||||
VBD | 0.8571 | 0.9231 | 0.8889
|
||||
JJ | 0.5882 | 0.8333 | 0.6897
|
||||
, | 1.0000 | 1.0000 | 1.0000
|
||||
<BLANKLINE>
|
||||
|
||||
Similarly, we can display the confusion matrix for this tagger.
|
||||
|
||||
>>> print(tagger.confusion(gold_data))
|
||||
| - |
|
||||
| N |
|
||||
| O |
|
||||
| N J J N N P P R V V V V V |
|
||||
| ' E C C D I J J J N N N O R R B T V B B B B B W ` |
|
||||
| ' , - . : C D T N J R S N P S S P B R O B D G N P Z P ` |
|
||||
-------+-------------------------------------------------------------------------------------+
|
||||
'' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
, | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
-NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . |
|
||||
. | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
: | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . |
|
||||
CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . |
|
||||
DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . |
|
||||
IN | . . . . . . . .<34> . . . . . . . . . . . . . . . . . . . |
|
||||
JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . |
|
||||
JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . |
|
||||
JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . |
|
||||
NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . |
|
||||
NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . |
|
||||
NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . |
|
||||
POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . |
|
||||
PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . |
|
||||
RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . |
|
||||
RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . |
|
||||
TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . |
|
||||
VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . |
|
||||
VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . |
|
||||
VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . |
|
||||
VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . |
|
||||
VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . |
|
||||
VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . |
|
||||
WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . |
|
||||
`` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>|
|
||||
-------+-------------------------------------------------------------------------------------+
|
||||
(row = reference; col = test)
|
||||
<BLANKLINE>
|
||||
|
||||
Brill Trainer with evaluation
|
||||
=============================
|
||||
|
||||
>>> # Perform the relevant imports.
|
||||
>>> from nltk.tbl.template import Template
|
||||
>>> from nltk.tag.brill import Pos, Word
|
||||
>>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger
|
||||
|
||||
>>> # Load some data
|
||||
>>> from nltk.corpus import treebank
|
||||
>>> training_data = treebank.tagged_sents()[:100]
|
||||
>>> baseline_data = treebank.tagged_sents()[100:200]
|
||||
>>> gold_data = treebank.tagged_sents()[200:300]
|
||||
>>> testing_data = [untag(s) for s in gold_data]
|
||||
|
||||
>>> backoff = RegexpTagger([
|
||||
... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
|
||||
... (r'(The|the|A|a|An|an)$', 'AT'), # articles
|
||||
... (r'.*able$', 'JJ'), # adjectives
|
||||
... (r'.*ness$', 'NN'), # nouns formed from adjectives
|
||||
... (r'.*ly$', 'RB'), # adverbs
|
||||
... (r'.*s$', 'NNS'), # plural nouns
|
||||
... (r'.*ing$', 'VBG'), # gerunds
|
||||
... (r'.*ed$', 'VBD'), # past tense verbs
|
||||
... (r'.*', 'NN') # nouns (default)
|
||||
... ])
|
||||
|
||||
We've now created a simple ``RegexpTagger``, which tags according to the regular expression
|
||||
rules it has been supplied. This tagger in and of itself does not have a great accuracy.
|
||||
|
||||
>>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS
|
||||
0.245014...
|
||||
|
||||
Neither does a simple ``UnigramTagger``. This tagger is trained on some data,
|
||||
and will then first try to match unigrams (i.e. tokens) of the sentence it has
|
||||
to tag to the learned data.
|
||||
|
||||
>>> unigram_tagger = UnigramTagger(baseline_data)
|
||||
>>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS
|
||||
0.581196...
|
||||
|
||||
The lackluster accuracy here can be explained with the following example:
|
||||
|
||||
>>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), ('to', 'TO'), ('be', 'VB'), ('tagged', None)]
|
||||
|
||||
As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary).
|
||||
The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms.
|
||||
|
||||
In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real
|
||||
baseline will use such a backoff. We'll create a ``UnigramTagger`` like before, but now
|
||||
the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger``
|
||||
encounters an OOV token.
|
||||
|
||||
>>> baseline = UnigramTagger(baseline_data, backoff=backoff)
|
||||
>>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS
|
||||
0.7537647...
|
||||
|
||||
That is already much better. We can investigate the performance further by running
|
||||
``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure*
|
||||
of each tag.
|
||||
|
||||
>>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True))
|
||||
Tag | Prec. | Recall | F-measure
|
||||
-------+--------+--------+-----------
|
||||
NNP | 0.9674 | 0.2738 | 0.4269
|
||||
NN | 0.4111 | 0.9136 | 0.5670
|
||||
IN | 0.9383 | 0.9580 | 0.9480
|
||||
DT | 0.9819 | 0.8859 | 0.9314
|
||||
JJ | 0.8167 | 0.2970 | 0.4356
|
||||
NNS | 0.7393 | 0.9630 | 0.8365
|
||||
-NONE- | 1.0000 | 0.8345 | 0.9098
|
||||
, | 1.0000 | 1.0000 | 1.0000
|
||||
. | 1.0000 | 1.0000 | 1.0000
|
||||
VBD | 0.6429 | 0.8804 | 0.7431
|
||||
CD | 1.0000 | 0.9872 | 0.9935
|
||||
CC | 1.0000 | 0.9355 | 0.9667
|
||||
VB | 0.7778 | 0.3684 | 0.5000
|
||||
VBN | 0.9375 | 0.3000 | 0.4545
|
||||
RB | 0.7778 | 0.7447 | 0.7609
|
||||
TO | 1.0000 | 1.0000 | 1.0000
|
||||
VBZ | 0.9643 | 0.6429 | 0.7714
|
||||
VBG | 0.6415 | 0.9444 | 0.7640
|
||||
PRP$ | 1.0000 | 1.0000 | 1.0000
|
||||
PRP | 1.0000 | 0.5556 | 0.7143
|
||||
MD | 1.0000 | 1.0000 | 1.0000
|
||||
VBP | 0.6471 | 0.5789 | 0.6111
|
||||
POS | 1.0000 | 1.0000 | 1.0000
|
||||
$ | 1.0000 | 0.8182 | 0.9000
|
||||
'' | 1.0000 | 1.0000 | 1.0000
|
||||
: | 1.0000 | 1.0000 | 1.0000
|
||||
WDT | 0.4000 | 0.2000 | 0.2667
|
||||
`` | 1.0000 | 1.0000 | 1.0000
|
||||
JJR | 1.0000 | 0.5000 | 0.6667
|
||||
NNPS | 0.0000 | 0.0000 | 0.0000
|
||||
RBR | 1.0000 | 1.0000 | 1.0000
|
||||
-LRB- | 0.0000 | 0.0000 | 0.0000
|
||||
-RRB- | 0.0000 | 0.0000 | 0.0000
|
||||
RP | 0.6667 | 0.6667 | 0.6667
|
||||
EX | 0.5000 | 0.5000 | 0.5000
|
||||
JJS | 0.0000 | 0.0000 | 0.0000
|
||||
WP | 1.0000 | 1.0000 | 1.0000
|
||||
PDT | 0.0000 | 0.0000 | 0.0000
|
||||
AT | 0.0000 | 0.0000 | 0.0000
|
||||
<BLANKLINE>
|
||||
|
||||
It's clear that although the precision of tagging `"NNP"` is high, the recall is very low.
|
||||
In other words, we're missing a lot of cases where the true label is `"NNP"`. We can see
|
||||
a similar effect with `"JJ"`.
|
||||
|
||||
We can also see a very expected result: The precision of `"NN"` is low, while the recall
|
||||
is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and
|
||||
``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. So,
|
||||
we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"`
|
||||
many tokens that shouldn't be `"NN"`.
|
||||
|
||||
This method gives us some insight into which parts of the tagger need more attention, and why.
|
||||
However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually
|
||||
tagged as.
|
||||
To help with that, we can create a confusion matrix.
|
||||
|
||||
>>> print(baseline.confusion(gold_data))
|
||||
| - |
|
||||
| - N - |
|
||||
| L O R N P |
|
||||
| R N R J J N N N P P P R R V V V V V W |
|
||||
| ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` |
|
||||
| $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` |
|
||||
-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
$ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . |
|
||||
'' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
, | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
-LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . |
|
||||
-NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . |
|
||||
-RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . |
|
||||
. | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
: | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . |
|
||||
CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . |
|
||||
DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . |
|
||||
EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . |
|
||||
IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . |
|
||||
JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . |
|
||||
JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . |
|
||||
JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . |
|
||||
MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . |
|
||||
NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . |
|
||||
NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . |
|
||||
NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . |
|
||||
NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . |
|
||||
PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . |
|
||||
POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . |
|
||||
PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . |
|
||||
PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . |
|
||||
RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . |
|
||||
RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . |
|
||||
RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . |
|
||||
TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . |
|
||||
VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . |
|
||||
VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . |
|
||||
VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . |
|
||||
VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . |
|
||||
VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . |
|
||||
VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . |
|
||||
WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . |
|
||||
WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . |
|
||||
`` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>|
|
||||
-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
(row = reference; col = test)
|
||||
<BLANKLINE>
|
||||
|
||||
Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that,
|
||||
we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`.
|
||||
This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be
|
||||
tagged as `"JJ"` are actually tagged as `"NN"` by our tagger.
|
||||
|
||||
This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses
|
||||
templates to attempt to improve the performance of the tagger.
|
||||
|
||||
>>> # Set up templates
|
||||
>>> Template._cleartemplates() #clear any templates created in earlier tests
|
||||
>>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))]
|
||||
|
||||
>>> # Construct a BrillTaggerTrainer
|
||||
>>> tt = BrillTaggerTrainer(baseline, templates, trace=3)
|
||||
>>> tagger1 = tt.train(training_data, max_rules=10)
|
||||
TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None)
|
||||
Finding initial useful rules...
|
||||
Found 618 useful rules.
|
||||
<BLANKLINE>
|
||||
B |
|
||||
S F r O | Score = Fixed - Broken
|
||||
c i o t | R Fixed = num tags changed incorrect -> correct
|
||||
o x k h | u Broken = num tags changed correct -> incorrect
|
||||
r e e e | l Other = num tags changed incorrect -> incorrect
|
||||
e d n r | e
|
||||
------------------+-------------------------------------------------------
|
||||
13 14 1 4 | NN->VB if Pos:TO@[-1]
|
||||
8 8 0 0 | NN->VB if Pos:MD@[-1]
|
||||
7 10 3 22 | NN->IN if Pos:NNS@[-1]
|
||||
5 5 0 0 | NN->VBP if Pos:PRP@[-1]
|
||||
5 5 0 0 | VBD->VBN if Pos:VBZ@[-1]
|
||||
5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0]
|
||||
4 4 0 0 | NN->-NONE- if Pos:WP@[-1]
|
||||
4 4 0 3 | NN->NNP if Pos:-NONE-@[-1]
|
||||
4 6 2 2 | NN->NNP if Pos:NNP@[-1]
|
||||
4 4 0 0 | NNS->VBZ if Pos:PRP@[-1]
|
||||
|
||||
>>> tagger1.rules()[1:3]
|
||||
(Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')]))
|
||||
|
||||
>>> tagger1.print_template_statistics(printunused=False)
|
||||
TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules)
|
||||
TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948
|
||||
#ID | Score (train) | #Rules | Template
|
||||
--------------------------------------------
|
||||
000 | 54 0.915 | 9 0.900 | Template(Pos([-1]))
|
||||
001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0]))
|
||||
<BLANKLINE>
|
||||
<BLANKLINE>
|
||||
|
||||
>>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS
|
||||
0.769230...
|
||||
|
||||
>>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True))
|
||||
Tag | Prec. | Recall | F-measure
|
||||
-------+--------+--------+-----------
|
||||
NNP | 0.8298 | 0.3600 | 0.5021
|
||||
NN | 0.4435 | 0.8364 | 0.5797
|
||||
IN | 0.8476 | 0.9580 | 0.8994
|
||||
DT | 0.9819 | 0.8859 | 0.9314
|
||||
JJ | 0.8167 | 0.2970 | 0.4356
|
||||
NNS | 0.7464 | 0.9630 | 0.8410
|
||||
-NONE- | 1.0000 | 0.8414 | 0.9139
|
||||
, | 1.0000 | 1.0000 | 1.0000
|
||||
. | 1.0000 | 1.0000 | 1.0000
|
||||
VBD | 0.6723 | 0.8696 | 0.7583
|
||||
CD | 1.0000 | 0.9872 | 0.9935
|
||||
CC | 1.0000 | 0.9355 | 0.9667
|
||||
VB | 0.8103 | 0.8246 | 0.8174
|
||||
VBN | 0.9130 | 0.4200 | 0.5753
|
||||
RB | 0.7778 | 0.7447 | 0.7609
|
||||
TO | 1.0000 | 1.0000 | 1.0000
|
||||
VBZ | 0.9667 | 0.6905 | 0.8056
|
||||
VBG | 0.6415 | 0.9444 | 0.7640
|
||||
PRP$ | 1.0000 | 1.0000 | 1.0000
|
||||
PRP | 1.0000 | 0.5556 | 0.7143
|
||||
MD | 1.0000 | 1.0000 | 1.0000
|
||||
VBP | 0.6316 | 0.6316 | 0.6316
|
||||
POS | 1.0000 | 1.0000 | 1.0000
|
||||
$ | 1.0000 | 0.8182 | 0.9000
|
||||
'' | 1.0000 | 1.0000 | 1.0000
|
||||
: | 1.0000 | 1.0000 | 1.0000
|
||||
WDT | 0.4000 | 0.2000 | 0.2667
|
||||
`` | 1.0000 | 1.0000 | 1.0000
|
||||
JJR | 1.0000 | 0.5000 | 0.6667
|
||||
NNPS | 0.0000 | 0.0000 | 0.0000
|
||||
RBR | 1.0000 | 1.0000 | 1.0000
|
||||
-LRB- | 0.0000 | 0.0000 | 0.0000
|
||||
-RRB- | 0.0000 | 0.0000 | 0.0000
|
||||
RP | 0.6667 | 0.6667 | 0.6667
|
||||
EX | 0.5000 | 0.5000 | 0.5000
|
||||
JJS | 0.0000 | 0.0000 | 0.0000
|
||||
WP | 1.0000 | 1.0000 | 1.0000
|
||||
PDT | 0.0000 | 0.0000 | 0.0000
|
||||
AT | 0.0000 | 0.0000 | 0.0000
|
||||
<BLANKLINE>
|
||||
|
||||
>>> print(tagger1.confusion(gold_data))
|
||||
| - |
|
||||
| - N - |
|
||||
| L O R N P |
|
||||
| R N R J J N N N P P P R R V V V V V W |
|
||||
| ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` |
|
||||
| $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` |
|
||||
-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
$ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . |
|
||||
'' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
, | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
-LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . |
|
||||
-NONE- | . . . .<122> . . . . . . . . 1 . . . . 22 . . . . . . . . . . . . . . . . . . . . |
|
||||
-RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . |
|
||||
. | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
: | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . |
|
||||
CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . |
|
||||
CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . |
|
||||
DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . |
|
||||
EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . |
|
||||
IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . |
|
||||
JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . |
|
||||
JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . |
|
||||
JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . |
|
||||
MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . |
|
||||
NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . |
|
||||
NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . |
|
||||
NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . |
|
||||
NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . |
|
||||
PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . |
|
||||
POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . |
|
||||
PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . |
|
||||
PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . |
|
||||
RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . |
|
||||
RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . |
|
||||
RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . |
|
||||
TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . |
|
||||
VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . |
|
||||
VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . |
|
||||
VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . |
|
||||
VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . |
|
||||
VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . |
|
||||
VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . |
|
||||
WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . |
|
||||
WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . |
|
||||
`` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>|
|
||||
-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
(row = reference; col = test)
|
||||
<BLANKLINE>
|
||||
|
||||
>>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data)
|
||||
>>> tagged[33][12:] # doctest: +NORMALIZE_WHITESPACE
|
||||
[('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')]
|
||||
|
||||
Regression Tests
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
Sequential Taggers
|
||||
------------------
|
||||
|
||||
Add tests for:
|
||||
- make sure backoff is being done correctly.
|
||||
- make sure ngram taggers don't use previous sentences for context.
|
||||
- make sure ngram taggers see 'beginning of the sentence' as a
|
||||
unique context
|
||||
- make sure regexp tagger's regexps are tried in order
|
||||
- train on some simple examples, & make sure that the size & the
|
||||
generated models are correct.
|
||||
- make sure cutoff works as intended
|
||||
- make sure that ngram models only exclude contexts covered by the
|
||||
backoff tagger if the backoff tagger gets that context correct at
|
||||
*all* locations.
|
||||
|
||||
|
||||
Regression Testing for issue #1025
|
||||
==================================
|
||||
|
||||
We want to ensure that a RegexpTagger can be created with more than 100 patterns
|
||||
and does not fail with: "AssertionError: sorry, but this version only supports 100 named groups"
|
||||
|
||||
>>> from nltk.tag import RegexpTagger
|
||||
>>> patterns = [(str(i), 'NNP',) for i in range(200)]
|
||||
>>> tagger = RegexpTagger(patterns)
|
||||
|
||||
Regression Testing for issue #2483
|
||||
==================================
|
||||
|
||||
Ensure that tagging with pos_tag (PerceptronTagger) does not throw an IndexError
|
||||
when attempting to tag an empty string. What it must return instead is not
|
||||
strictly defined.
|
||||
|
||||
>>> from nltk.tag import pos_tag
|
||||
>>> pos_tag(['', 'is', 'a', 'beautiful', 'day']) # doctest: +NORMALIZE_WHITESPACE
|
||||
[('', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('beautiful', 'JJ'), ('day', 'NN')]
|
||||
@@ -0,0 +1,446 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
>>> from nltk.tokenize import *
|
||||
|
||||
Regression Tests: NLTKWordTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Tokenizing some test strings.
|
||||
|
||||
>>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
|
||||
>>> word_tokenize(s1)
|
||||
['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.']
|
||||
>>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said."
|
||||
>>> word_tokenize(s2)
|
||||
['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
|
||||
>>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
|
||||
>>> word_tokenize(s3)
|
||||
['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.']
|
||||
>>> s4 = "I cannot cannot work under these conditions!"
|
||||
>>> word_tokenize(s4)
|
||||
['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!']
|
||||
>>> s5 = "The company spent $30,000,000 last year."
|
||||
>>> word_tokenize(s5)
|
||||
['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.']
|
||||
>>> s6 = "The company spent 40.75% of its income last year."
|
||||
>>> word_tokenize(s6)
|
||||
['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.']
|
||||
>>> s7 = "He arrived at 3:00 pm."
|
||||
>>> word_tokenize(s7)
|
||||
['He', 'arrived', 'at', '3:00', 'pm', '.']
|
||||
>>> s8 = "I bought these items: books, pencils, and pens."
|
||||
>>> word_tokenize(s8)
|
||||
['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.']
|
||||
>>> s9 = "Though there were 150, 100 of them were old."
|
||||
>>> word_tokenize(s9)
|
||||
['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.']
|
||||
>>> s10 = "There were 300,000, but that wasn't enough."
|
||||
>>> word_tokenize(s10)
|
||||
['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.']
|
||||
>>> s11 = "It's more'n enough."
|
||||
>>> word_tokenize(s11)
|
||||
['It', "'s", 'more', "'n", 'enough', '.']
|
||||
|
||||
Gathering the spans of the tokenized strings.
|
||||
|
||||
>>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).'''
|
||||
>>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
|
||||
... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
|
||||
... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
|
||||
... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
|
||||
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
|
||||
True
|
||||
>>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
|
||||
... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
|
||||
... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
|
||||
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
|
||||
True
|
||||
|
||||
>>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."'''
|
||||
>>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12),
|
||||
... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36),
|
||||
... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62),
|
||||
... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82),
|
||||
... (82, 83), (83, 84)]
|
||||
>>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
|
||||
True
|
||||
>>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to',
|
||||
... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost',
|
||||
... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"']
|
||||
>>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
|
||||
True
|
||||
|
||||
Testing improvement made to the TreebankWordTokenizer
|
||||
|
||||
>>> sx1 = '\xabNow that I can do.\xbb'
|
||||
>>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb']
|
||||
>>> word_tokenize(sx1) == expected
|
||||
True
|
||||
>>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.'
|
||||
>>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.']
|
||||
>>> word_tokenize(sx2) == expected
|
||||
True
|
||||
|
||||
|
||||
Testing treebank's detokenizer
|
||||
|
||||
>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
|
||||
>>> detokenizer = TreebankWordDetokenizer()
|
||||
>>> s = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.'
|
||||
>>> s = "\"We beat some pretty good teams to get here,\" Slocum said."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'"We beat some pretty good teams to get here," Slocum said.'
|
||||
>>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.'
|
||||
>>> s = "I cannot cannot work under these conditions!"
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'I cannot cannot work under these conditions!'
|
||||
>>> s = "The company spent $30,000,000 last year."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'The company spent $30,000,000 last year.'
|
||||
>>> s = "The company spent 40.75% of its income last year."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'The company spent 40.75% of its income last year.'
|
||||
>>> s = "He arrived at 3:00 pm."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'He arrived at 3:00 pm.'
|
||||
>>> s = "I bought these items: books, pencils, and pens."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'I bought these items: books, pencils, and pens.'
|
||||
>>> s = "Though there were 150, 100 of them were old."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'Though there were 150, 100 of them were old.'
|
||||
>>> s = "There were 300,000, but that wasn't enough."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
"There were 300,000, but that wasn't enough."
|
||||
>>> s = 'How "are" you?'
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'How "are" you?'
|
||||
>>> s = "Hello (world)"
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'Hello (world)'
|
||||
>>> s = '<A sentence> with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").'
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'<A sentence> with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").'
|
||||
>>> s = "Sentence ending with (parentheses)"
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'Sentence ending with (parentheses)'
|
||||
>>> s = "(Sentence) starting with parentheses."
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
'(Sentence) starting with parentheses.'
|
||||
>>> s = "I've"
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
"I've"
|
||||
>>> s = "Don't"
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
"Don't"
|
||||
>>> s = "I'd"
|
||||
>>> detokenizer.detokenize(word_tokenize(s))
|
||||
"I'd"
|
||||
|
||||
|
||||
Sentence tokenization in word_tokenize:
|
||||
|
||||
>>> s11 = "I called Dr. Jones. I called Dr. Jones."
|
||||
>>> word_tokenize(s11)
|
||||
['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.']
|
||||
>>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen "
|
||||
... "Kuchen einzukaufen. Ich muss.")
|
||||
>>> word_tokenize(s12)
|
||||
['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw',
|
||||
'.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
|
||||
>>> word_tokenize(s12, 'german')
|
||||
['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.',
|
||||
'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.']
|
||||
|
||||
|
||||
Regression Tests: Regexp Tokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Some additional test strings.
|
||||
|
||||
>>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n"
|
||||
... "two of them.\n\nThanks.")
|
||||
>>> s2 = ("Alas, it has not rained today. When, do you think, "
|
||||
... "will it rain again?")
|
||||
>>> s3 = ("<p>Although this is <b>not</b> the case here, we must "
|
||||
... "not relax our vigilance!</p>")
|
||||
|
||||
>>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
|
||||
[', ', '. ', ', ', ', ', '?']
|
||||
>>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
|
||||
['Alas', 'it has not rained today', 'When', 'do you think',
|
||||
'will it rain again']
|
||||
|
||||
Take care to avoid using capturing groups:
|
||||
|
||||
>>> regexp_tokenize(s3, r'</?[bp]>', gaps=False)
|
||||
['<p>', '<b>', '</b>', '</p>']
|
||||
>>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False)
|
||||
['<p>', '<b>', '</b>', '</p>']
|
||||
>>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=True)
|
||||
['Although this is ', 'not',
|
||||
' the case here, we must not relax our vigilance!']
|
||||
|
||||
Named groups are capturing groups, and confuse the tokenizer:
|
||||
|
||||
>>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=False)
|
||||
['p', 'b', 'b', 'p']
|
||||
>>> regexp_tokenize(s3, r'</?(?P<named>b|p)>', gaps=True)
|
||||
['p', 'Although this is ', 'b', 'not', 'b',
|
||||
' the case here, we must not relax our vigilance!', 'p']
|
||||
|
||||
Make sure that nested groups don't confuse the tokenizer:
|
||||
|
||||
>>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False)
|
||||
['las', 'has', 'rai', 'rai']
|
||||
>>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True)
|
||||
['A', ', it ', ' not ', 'ned today. When, do you think, will it ',
|
||||
'n again?']
|
||||
|
||||
Back-references require capturing groups, and these are not supported:
|
||||
|
||||
>>> regexp_tokenize("aabbbcccc", r'(.)\1')
|
||||
['a', 'b', 'c', 'c']
|
||||
|
||||
A simple sentence tokenizer, using the non-capturing pattern ``\.(?:\s+|$)``:
|
||||
|
||||
>>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True)
|
||||
['Good muffins cost $3.88\nin New York',
|
||||
'Please buy me\ntwo of them', 'Thanks']
|
||||
|
||||
|
||||
Regression Tests: TweetTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
TweetTokenizer is a tokenizer specifically designed for micro-blogging tokenization tasks.
|
||||
|
||||
>>> from nltk.tokenize import TweetTokenizer
|
||||
>>> tknzr = TweetTokenizer()
|
||||
>>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
|
||||
>>> tknzr.tokenize(s0)
|
||||
['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
|
||||
>>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)"
|
||||
>>> tknzr.tokenize(s1)
|
||||
['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)']
|
||||
>>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn"
|
||||
>>> tknzr.tokenize(s2)
|
||||
['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn']
|
||||
>>> s3 = "@Insanomania They do... Their mentality doesn't :("
|
||||
>>> tknzr.tokenize(s3)
|
||||
['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':(']
|
||||
>>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!"
|
||||
>>> tknzr.tokenize(s4)
|
||||
['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!']
|
||||
>>> tknzr = TweetTokenizer(reduce_len=True)
|
||||
>>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :("
|
||||
>>> tknzr.tokenize(s5)
|
||||
['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':(']
|
||||
|
||||
It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. When `strip_handles` is True, the tokenizer removes Twitter handles (e.g. usernames). When `reduce_len` is True, repeated character sequences of length 3 or greater are replaced with sequences of length 3.
|
||||
|
||||
>>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
|
||||
>>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!'
|
||||
>>> tknzr.tokenize(s6)
|
||||
[':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
|
||||
>>> s7 = '@_willy65: No place for @chuck tonight. Sorry.'
|
||||
>>> tknzr.tokenize(s7)
|
||||
[':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.']
|
||||
>>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.'
|
||||
>>> tknzr.tokenize(s8)
|
||||
['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.']
|
||||
|
||||
Setting the `preserve_case` parameter (default: True) to False converts uppercase tokens to lowercase. Emoticons are not affected:
|
||||
|
||||
>>> tknzr = TweetTokenizer(preserve_case=False)
|
||||
>>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P"
|
||||
>>> tknzr.tokenize(s9)
|
||||
['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P']
|
||||
|
||||
It should not hang on long sequences of the same punctuation character.
|
||||
|
||||
>>> tknzr = TweetTokenizer()
|
||||
>>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L"
|
||||
>>> tknzr.tokenize(s10)
|
||||
['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L']
|
||||
|
||||
Tokenizing multiple sentences at once:
|
||||
|
||||
>>> tknzr = TweetTokenizer()
|
||||
>>> sentences = [
|
||||
... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--",
|
||||
... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P",
|
||||
... "@_willy65: No place for @chuck tonight. Sorry."
|
||||
... ]
|
||||
>>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE
|
||||
[['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'],
|
||||
['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'],
|
||||
['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']]
|
||||
|
||||
|
||||
Regression Tests: PunktSentenceTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The sentence splitter should remove whitespace following the sentence boundary.
|
||||
|
||||
>>> pst = PunktSentenceTokenizer()
|
||||
>>> pst.tokenize('See Section 3). Or Section 2). ')
|
||||
['See Section 3).', 'Or Section 2).']
|
||||
>>> pst.tokenize('See Section 3.) Or Section 2.) ')
|
||||
['See Section 3.)', 'Or Section 2.)']
|
||||
>>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False)
|
||||
['See Section 3.', ') Or Section 2.', ')']
|
||||
|
||||
|
||||
Two instances of PunktSentenceTokenizer should not share PunktParameters.
|
||||
|
||||
>>> pst = PunktSentenceTokenizer()
|
||||
>>> pst2 = PunktSentenceTokenizer()
|
||||
>>> pst._params is pst2._params
|
||||
False
|
||||
|
||||
Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067
|
||||
|
||||
>>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer
|
||||
>>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters
|
||||
>>> pbc = PunktBaseClass(lang_vars=None, params=None)
|
||||
>>> type(pbc._params)
|
||||
<class 'nltk.tokenize.punkt.PunktParameters'>
|
||||
>>> type(pbc._lang_vars)
|
||||
<class 'nltk.tokenize.punkt.PunktLanguageVars'>
|
||||
>>> pt = PunktTrainer(lang_vars=None)
|
||||
>>> type(pt._lang_vars)
|
||||
<class 'nltk.tokenize.punkt.PunktLanguageVars'>
|
||||
>>> pst = PunktSentenceTokenizer(lang_vars=None)
|
||||
>>> type(pst._lang_vars)
|
||||
<class 'nltk.tokenize.punkt.PunktLanguageVars'>
|
||||
|
||||
Testing that inputs can start with dots.
|
||||
|
||||
>>> pst = PunktSentenceTokenizer(lang_vars=None)
|
||||
>>> pst.tokenize(". This input starts with a dot. This used to cause issues.")
|
||||
['.', 'This input starts with a dot.', 'This used to cause issues.']
|
||||
|
||||
Regression Tests: align_tokens
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Post-hoc alignment of tokens with a source string
|
||||
|
||||
>>> from nltk.tokenize.util import align_tokens
|
||||
>>> list(align_tokens([''], ""))
|
||||
[(0, 0)]
|
||||
>>> list(align_tokens([''], " "))
|
||||
[(0, 0)]
|
||||
>>> list(align_tokens([], ""))
|
||||
[]
|
||||
>>> list(align_tokens([], " "))
|
||||
[]
|
||||
>>> list(align_tokens(['a'], "a"))
|
||||
[(0, 1)]
|
||||
>>> list(align_tokens(['abc', 'def'], "abcdef"))
|
||||
[(0, 3), (3, 6)]
|
||||
>>> list(align_tokens(['abc', 'def'], "abc def"))
|
||||
[(0, 3), (4, 7)]
|
||||
>>> list(align_tokens(['ab', 'cd'], "ab cd ef"))
|
||||
[(0, 2), (3, 5)]
|
||||
>>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef"))
|
||||
[(0, 2), (3, 5), (6, 8)]
|
||||
>>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef"))
|
||||
Traceback (most recent call last):
|
||||
....
|
||||
ValueError: substring "efg" not found in "ab cd ef"
|
||||
>>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef"))
|
||||
Traceback (most recent call last):
|
||||
....
|
||||
ValueError: substring "gh" not found in "ab cd ef"
|
||||
>>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday."))
|
||||
[(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)]
|
||||
|
||||
|
||||
Regression Tests: MWETokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
Pickle an MWETokenizer
|
||||
|
||||
>>> from nltk.tokenize import MWETokenizer
|
||||
>>> import pickle
|
||||
|
||||
>>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
|
||||
>>> p = pickle.dumps(tokenizer)
|
||||
>>> unpickeled = pickle.loads(p)
|
||||
>>> unpickeled.tokenize("An hors d'oeuvre tonight, sir?".split())
|
||||
['An', "hors+d'oeuvre", 'tonight,', 'sir?']
|
||||
|
||||
|
||||
Regression Tests: TextTilingTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
TextTilingTokenizer tokenizes text into coherent subtopic chunks based upon Hearst's TextTiling algorithm.
|
||||
|
||||
>>> from nltk.tokenize import TextTilingTokenizer
|
||||
>>> from nltk.corpus import brown
|
||||
>>> tt = TextTilingTokenizer()
|
||||
>>> tt.tokenize(brown.raw()[0:1000])
|
||||
["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"]
|
||||
|
||||
Test that `ValueError` exceptions are raised when illegal arguments are used.
|
||||
|
||||
>>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000])
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Similarity method foo not recognized
|
||||
>>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000])
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: Smoothing method bar not recognized
|
||||
|
||||
|
||||
Regression Tests: ToktokTokenizer
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
>>> toktok = ToktokTokenizer()
|
||||
>>> text = u'Is 9.5 or 525,600 my favorite number?'
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
Is 9.5 or 525,600 my favorite number ?
|
||||
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
|
||||
>>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
|
||||
>>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf'
|
||||
>>> assert toktok.tokenize(text, return_str=True) == expected
|
||||
>>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf']
|
||||
True
|
||||
|
||||
Taking comments from the code and turning them into actual tests...
|
||||
|
||||
# Don't tokenize a period unless it ends the line and isn't
|
||||
# preceded by another period, e.g.
|
||||
# "something ..." -> "something ..."
|
||||
>>> text = "something ..."
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
something ...
|
||||
|
||||
# "something." -> "something ."
|
||||
>>> text = "something."
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
something .
|
||||
|
||||
# Don't tokenize a period unless it ends the line, e.g.
|
||||
# " ... stuff." -> "... stuff ."
|
||||
>>> text = "also more ... stuff."
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
also more ... stuff .
|
||||
|
||||
Demonstrate that the "FUNKY_PUNCT_1" and "FUNKY_PUNCT_2" patterns do what
|
||||
they're supposed to do. For example, FUNKY_PUNCT_1 splits out inverted question
|
||||
marks.
|
||||
>>> text = "¿Quieres una taza de café?"
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
¿ Quieres una taza de café ?
|
||||
|
||||
This one would have failed without the FUNKY_PUNCT_2 pattern included.
|
||||
>>> text = "«Sí, por favor.»"
|
||||
>>> print(toktok.tokenize(text, return_str=True))
|
||||
« Sí , por favor . »
|
||||
@@ -0,0 +1,306 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
===============================
|
||||
Unit test cases for ``toolbox``
|
||||
===============================
|
||||
|
||||
>>> from nltk import toolbox
|
||||
|
||||
--------------------------
|
||||
``toolbox.StandardFormat``
|
||||
--------------------------
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
|
||||
``toolbox.StandardFormat.open()``
|
||||
---------------------------------
|
||||
>>> import os, tempfile
|
||||
>>> (fd, fname) = tempfile.mkstemp()
|
||||
>>> tf = os.fdopen(fd, "w")
|
||||
>>> _ = tf.write('\\lx a value\n\\lx another value\n')
|
||||
>>> tf.close()
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open(fname)
|
||||
>>> list(f.fields())
|
||||
[('lx', 'a value'), ('lx', 'another value')]
|
||||
>>> f.close()
|
||||
>>> os.unlink(fname)
|
||||
|
||||
``toolbox.StandardFormat.open_string()``
|
||||
----------------------------------------
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\n\\lx another value\n')
|
||||
>>> list(f.fields())
|
||||
[('lx', 'a value'), ('lx', 'another value')]
|
||||
>>> f.close()
|
||||
|
||||
``toolbox.StandardFormat.close()``
|
||||
----------------------------------
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\n\\lx another value\n')
|
||||
>>> list(f.fields())
|
||||
[('lx', 'a value'), ('lx', 'another value')]
|
||||
>>> f.close()
|
||||
|
||||
``toolbox.StandardFormat.line_num``
|
||||
---------------------------------------
|
||||
|
||||
``StandardFormat.line_num`` contains the line number of the last line returned:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n')
|
||||
>>> line_nums = []
|
||||
>>> for l in f.raw_fields():
|
||||
... line_nums.append(f.line_num)
|
||||
>>> line_nums
|
||||
[1, 2, 3]
|
||||
|
||||
When fields span several lines, ``StandardFormat.line_num`` contains the number of the last line read for each field (trailing blank lines included):
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
|
||||
>>> line_nums = []
|
||||
>>> for l in f.raw_fields():
|
||||
... line_nums.append(f.line_num)
|
||||
>>> line_nums
|
||||
[2, 5, 7]
|
||||
|
||||
``StandardFormat.line_num`` doesn't exist before opening or after closing
|
||||
a file or string:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.line_num
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AttributeError: 'StandardFormat' object has no attribute 'line_num'
|
||||
>>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n')
|
||||
>>> line_nums = []
|
||||
>>> for l in f.raw_fields():
|
||||
... line_nums.append(f.line_num)
|
||||
>>> line_nums
|
||||
[2, 5, 7]
|
||||
>>> f.close()
|
||||
>>> f.line_num
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
AttributeError: 'StandardFormat' object has no attribute 'line_num'
|
||||
|
||||
``toolbox.StandardFormat.raw_fields()``
|
||||
---------------------------------------
|
||||
``raw_fields()`` returns an iterator over tuples of two strings representing the
|
||||
marker and its value. The marker is given without the backslash and the value
|
||||
without its trailing newline:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\n\\lx another value\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'a value'), ('lx', 'another value')]
|
||||
|
||||
an empty file returns nothing:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('')
|
||||
>>> list(f.raw_fields())
|
||||
[]
|
||||
|
||||
a file with only a newline currently returns a single field with no marker and an empty value (whether this is the desired result is an open question):
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\n')
|
||||
>>> list(f.raw_fields())
|
||||
[(None, '')]
|
||||
|
||||
file with only one field should be parsed ok:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx one value\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'one value')]
|
||||
|
||||
file without a trailing newline should be parsed ok:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\n\\lx another value')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'a value'), ('lx', 'another value')]
|
||||
|
||||
trailing white space is preserved except for the final newline:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')]
|
||||
|
||||
line wrapping is preserved:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
|
||||
|
||||
file beginning with a multiline record should be parsed ok:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')]
|
||||
|
||||
file ending with a multiline record should be parsed ok:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')]
|
||||
|
||||
file beginning with a BOM should be parsed ok:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'a value'), ('lx', 'another value')]
|
||||
|
||||
file beginning with two BOMs should ignore only the first one:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n')
|
||||
>>> list(f.raw_fields())
|
||||
[(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')]
|
||||
|
||||
should not ignore a BOM not at the beginning of the file:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('lx', 'a value\n\xef\xbb\xbf\\lx another value')]
|
||||
|
||||
``toolbox.StandardFormat.fields()``
|
||||
-----------------------------------
|
||||
trailing white space is not preserved:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n')
|
||||
>>> list(f.fields())
|
||||
[('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')]
|
||||
|
||||
multiline fields are unwrapped:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n')
|
||||
>>> list(f.fields())
|
||||
[('lx', 'a value more of the value and still more'), ('lc', 'another val')]
|
||||
|
||||
markers
|
||||
-------
|
||||
A backslash in the first position on a new line indicates the start of a
|
||||
marker. The backslash is not part of the marker:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\mk a value\n')
|
||||
>>> list(f.fields())
|
||||
[('mk', 'a value')]
|
||||
|
||||
If the backslash occurs later in the line it does not indicate the start
|
||||
of a marker:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\mk a value\n \\mk another one\n')
|
||||
>>> list(f.raw_fields())
|
||||
[('mk', 'a value\n \\mk another one')]
|
||||
|
||||
There is no specific limit to the length of a marker:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\this_is_an_extremely_long_marker value\n')
|
||||
>>> list(f.fields())
|
||||
[('this_is_an_extremely_long_marker', 'value')]
|
||||
|
||||
A marker can contain any non-whitespace character:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789 value\n')
|
||||
>>> list(f.fields())
|
||||
[('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')]
|
||||
|
||||
A marker is terminated by any white space character:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one')
|
||||
>>> list(f.fields())
|
||||
[('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')]
|
||||
|
||||
Consecutive whitespace characters (except newline) are treated the same as one:
|
||||
|
||||
>>> f = toolbox.StandardFormat()
|
||||
>>> f.open_string('\\mk \t\r\fa value\n')
|
||||
>>> list(f.fields())
|
||||
[('mk', 'a value')]
|
||||
|
||||
-----------------------
|
||||
``toolbox.ToolboxData``
|
||||
-----------------------
|
||||
|
||||
>>> db = toolbox.ToolboxData()
|
||||
|
||||
``toolbox.ToolboxData.parse()``
|
||||
-------------------------------
|
||||
check that normal parsing works:
|
||||
|
||||
>>> from xml.etree import ElementTree
|
||||
>>> td = toolbox.ToolboxData()
|
||||
>>> s = """\\_sh v3.0 400 Rotokas Dictionary
|
||||
... \\_DateStampHasFourDigitYear
|
||||
...
|
||||
... \\lx kaa
|
||||
... \\ps V.A
|
||||
... \\ge gag
|
||||
... \\gp nek i pas
|
||||
...
|
||||
... \\lx kaa
|
||||
... \\ps V.B
|
||||
... \\ge strangle
|
||||
... \\gp pasim nek
|
||||
... """
|
||||
>>> td.open_string(s)
|
||||
>>> tree = td.parse(key='lx')
|
||||
>>> tree.tag
|
||||
'toolbox_data'
|
||||
>>> ElementTree.tostring(list(tree)[0]).decode('utf8')
|
||||
'<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
|
||||
>>> ElementTree.tostring(list(tree)[1]).decode('utf8')
|
||||
'<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
|
||||
>>> ElementTree.tostring(list(tree)[2]).decode('utf8')
|
||||
'<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
|
||||
|
||||
check that guessing the key marker works:
|
||||
|
||||
>>> from xml.etree import ElementTree
|
||||
>>> td = toolbox.ToolboxData()
|
||||
>>> s = """\\_sh v3.0 400 Rotokas Dictionary
|
||||
... \\_DateStampHasFourDigitYear
|
||||
...
|
||||
... \\lx kaa
|
||||
... \\ps V.A
|
||||
... \\ge gag
|
||||
... \\gp nek i pas
|
||||
...
|
||||
... \\lx kaa
|
||||
... \\ps V.B
|
||||
... \\ge strangle
|
||||
... \\gp pasim nek
|
||||
... """
|
||||
>>> td.open_string(s)
|
||||
>>> tree = td.parse()
|
||||
>>> ElementTree.tostring(list(tree)[0]).decode('utf8')
|
||||
'<header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>'
|
||||
>>> ElementTree.tostring(list(tree)[1]).decode('utf8')
|
||||
'<record><lx>kaa</lx><ps>V.A</ps><ge>gag</ge><gp>nek i pas</gp></record>'
|
||||
>>> ElementTree.tostring(list(tree)[2]).decode('utf8')
|
||||
'<record><lx>kaa</lx><ps>V.B</ps><ge>strangle</ge><gp>pasim nek</gp></record>'
|
||||
|
||||
-----------------------
|
||||
``toolbox`` functions
|
||||
-----------------------
|
||||
|
||||
``toolbox.to_sfm_string()``
|
||||
-------------------------------
|
||||
@@ -0,0 +1,240 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
.. -*- coding: utf-8 -*-
|
||||
|
||||
=========
|
||||
Alignment
|
||||
=========
|
||||
|
||||
Corpus Reader
|
||||
-------------
|
||||
|
||||
>>> from nltk.corpus import comtrans
|
||||
>>> words = comtrans.words('alignment-en-fr.txt')
|
||||
>>> for word in words[:6]:
|
||||
... print(word)
|
||||
Resumption
|
||||
of
|
||||
the
|
||||
session
|
||||
I
|
||||
declare
|
||||
>>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0]
|
||||
>>> als
|
||||
AlignedSent(['Resumption', 'of', 'the', 'session'],
|
||||
['Reprise', 'de', 'la', 'session'],
|
||||
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]))
|
||||
|
||||
|
||||
Alignment Objects
|
||||
-----------------
|
||||
|
||||
Aligned sentences are simply a mapping between words in a sentence:
|
||||
|
||||
>>> print(" ".join(als.words))
|
||||
Resumption of the session
|
||||
>>> print(" ".join(als.mots))
|
||||
Reprise de la session
|
||||
>>> als.alignment
|
||||
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])
|
||||
|
||||
|
||||
Usually we look at them from the perspective of a source to a target language,
|
||||
but they are easily inverted:
|
||||
|
||||
>>> als.invert()
|
||||
AlignedSent(['Reprise', 'de', 'la', 'session'],
|
||||
['Resumption', 'of', 'the', 'session'],
|
||||
Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]))
|
||||
|
||||
|
||||
We can create new alignments, but these need to be in the correct range of
|
||||
the corresponding sentences:
|
||||
|
||||
>>> from nltk.translate import Alignment, AlignedSent
|
||||
>>> als = AlignedSent(['Reprise', 'de', 'la', 'session'],
|
||||
... ['Resumption', 'of', 'the', 'session'],
|
||||
... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)]))
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
IndexError: Alignment is outside boundary of mots
|
||||
|
||||
|
||||
You can set alignments with any sequence of tuples, so long as the first two
|
||||
indexes of the tuple are the alignment indices:
|
||||
|
||||
>>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
|
||||
|
||||
>>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))])
|
||||
Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))])
|
||||
|
||||
|
||||
Alignment Algorithms
|
||||
--------------------
|
||||
|
||||
EM for IBM Model 1
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Here is an example from Koehn, 2010:
|
||||
|
||||
>>> from nltk.translate import IBMModel1
|
||||
>>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']),
|
||||
... AlignedSent(['the', 'book'], ['das', 'Buch']),
|
||||
... AlignedSent(['a', 'book'], ['ein', 'Buch'])]
|
||||
>>> em_ibm1 = IBMModel1(corpus, 20)
|
||||
>>> print(round(em_ibm1.translation_table['the']['das'], 1))
|
||||
1.0
|
||||
>>> print(round(em_ibm1.translation_table['book']['das'], 1))
|
||||
0.0
|
||||
>>> print(round(em_ibm1.translation_table['house']['das'], 1))
|
||||
0.0
|
||||
>>> print(round(em_ibm1.translation_table['the']['Buch'], 1))
|
||||
0.0
|
||||
>>> print(round(em_ibm1.translation_table['book']['Buch'], 1))
|
||||
1.0
|
||||
>>> print(round(em_ibm1.translation_table['a']['Buch'], 1))
|
||||
0.0
|
||||
>>> print(round(em_ibm1.translation_table['book']['ein'], 1))
|
||||
0.0
|
||||
>>> print(round(em_ibm1.translation_table['a']['ein'], 1))
|
||||
1.0
|
||||
>>> print(round(em_ibm1.translation_table['the']['Haus'], 1))
|
||||
0.0
|
||||
>>> print(round(em_ibm1.translation_table['house']['Haus'], 1))
|
||||
1.0
|
||||
>>> print(round(em_ibm1.translation_table['book'][None], 1))
|
||||
0.5
|
||||
|
||||
And using an NLTK corpus. We train on only 10 sentences, since it is so slow:
|
||||
|
||||
>>> from nltk.corpus import comtrans
|
||||
>>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20)
|
||||
>>> print(round(com_ibm1.translation_table['bitte']['Please'], 1))
|
||||
0.2
|
||||
>>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1))
|
||||
1.0
|
||||
|
||||
|
||||
Evaluation
|
||||
----------
|
||||
The evaluation metrics for alignments are usually not interested in the
|
||||
contents of alignments but more often the comparison to a "gold standard"
|
||||
alignment that has been constructed by human experts. For this reason we
|
||||
often want to work just with raw set operations against the alignment points.
|
||||
This then gives us a very clean form for defining our evaluation metrics.
|
||||
|
||||
.. Note::
|
||||
The AlignedSent class has no distinction of "possible" or "sure"
|
||||
alignments. Thus all alignments are treated as "sure".
|
||||
|
||||
Consider the following aligned sentence for evaluation:
|
||||
|
||||
>>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'],
|
||||
... ['Reprise', 'de', 'la', 'session'],
|
||||
... Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)]))
|
||||
|
||||
Precision
|
||||
~~~~~~~~~
|
||||
``precision = |A∩P| / |A|``
|
||||
|
||||
**Precision** is probably the most well known evaluation metric and it is implemented
|
||||
in `nltk.metrics.scores.precision`_. Since precision is simply interested in the
|
||||
proportion of correct alignments, we calculate the ratio of the number of our
|
||||
test alignments (*A*) that match a possible alignment (*P*), over the number of
|
||||
test alignments provided. There is no penalty for missing a possible alignment
|
||||
in our test alignments. An easy way to game this metric is to provide just one
|
||||
test alignment that is in *P* [OCH2000]_.
|
||||
|
||||
Here are some examples:
|
||||
|
||||
>>> from nltk.metrics import precision
|
||||
>>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)])
|
||||
>>> precision(Alignment([]), als.alignment)
|
||||
0.0
|
||||
>>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
|
||||
1.0
|
||||
>>> precision(Alignment([(0,0), (3,3)]), als.alignment)
|
||||
0.5
|
||||
>>> precision(Alignment.fromstring('0-0 3-3'), als.alignment)
|
||||
0.5
|
||||
>>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment)
|
||||
1.0
|
||||
>>> precision(als.alignment, my_als.alignment)
|
||||
0.6
|
||||
|
||||
|
||||
.. _nltk.metrics.scores.precision:
|
||||
https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.precision
|
||||
|
||||
|
||||
Recall
|
||||
~~~~~~
|
||||
``recall = |A∩S| / |S|``
|
||||
|
||||
**Recall** is another well known evaluation metric that has a set based
|
||||
implementation in NLTK as `nltk.metrics.scores.recall`_. Since recall is
|
||||
simply interested in the proportion of found alignments, we calculate the
|
||||
ratio of the number of our test alignments (*A*) that match a sure alignment
|
||||
(*S*) over the number of sure alignments. There is no penalty for producing
|
||||
a lot of test alignments. An easy way to game this metric is to include every
|
||||
possible alignment in our test alignments, regardless of whether they are correct or
|
||||
not [OCH2000]_.
|
||||
|
||||
Here are some examples:
|
||||
|
||||
>>> from nltk.metrics import recall
|
||||
>>> print(recall(Alignment([]), als.alignment))
|
||||
None
|
||||
>>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
|
||||
1.0
|
||||
>>> recall(Alignment.fromstring('0-0 3-3'), als.alignment)
|
||||
1.0
|
||||
>>> recall(Alignment([(0,0), (3,3)]), als.alignment)
|
||||
1.0
|
||||
>>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment)
|
||||
0.66666...
|
||||
>>> recall(als.alignment, my_als.alignment)
|
||||
0.75
|
||||
|
||||
|
||||
.. _nltk.metrics.scores.recall:
|
||||
https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.recall
|
||||
|
||||
|
||||
Alignment Error Rate (AER)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
``AER = 1 - (|A∩S| + |A∩P|) / (|A| + |S|)``
|
||||
|
||||
**Alignment Error Rate** is a commonly used metric for assessing sentence
|
||||
alignments. It combines precision and recall metrics together such that a
|
||||
perfect alignment must have all of the sure alignments and may have some
|
||||
possible alignments [MIHALCEA2003]_ [KOEHN2010]_.
|
||||
|
||||
.. Note::
|
||||
[KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)``
|
||||
in his book, but corrects it to the above in his online errata. This is
|
||||
in line with [MIHALCEA2003]_.
|
||||
|
||||
Here are some examples:
|
||||
|
||||
>>> from nltk.translate import alignment_error_rate
|
||||
>>> alignment_error_rate(Alignment([]), als.alignment)
|
||||
1.0
|
||||
>>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment)
|
||||
0.0
|
||||
>>> alignment_error_rate(als.alignment, my_als.alignment)
|
||||
0.333333...
|
||||
>>> alignment_error_rate(als.alignment, my_als.alignment,
|
||||
... als.alignment | Alignment([(1,2), (2,1)]))
|
||||
0.222222...
|
||||
|
||||
|
||||
.. [OCH2000] Och, F. and Ney, H. (2000)
|
||||
*Statistical Machine Translation*, EAMT Workshop
|
||||
|
||||
.. [MIHALCEA2003] Mihalcea, R. and Pedersen, T. (2003)
|
||||
*An evaluation exercise for word alignment*, HLT-NAACL 2003
|
||||
|
||||
.. [KOEHN2010] Koehn, P. (2010)
|
||||
*Statistical Machine Translation*, Cambridge University Press
|
||||
1223
Backend/venv/lib/python3.12/site-packages/nltk/test/tree.doctest
Normal file
1223
Backend/venv/lib/python3.12/site-packages/nltk/test/tree.doctest
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,177 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
=========================================================
|
||||
Unit tests for nltk.tree.prettyprinter.TreePrettyPrinter
|
||||
=========================================================
|
||||
|
||||
>>> from nltk.tree import Tree, TreePrettyPrinter
|
||||
|
||||
Tree nr 2170 from nltk.corpus.treebank:
|
||||
|
||||
>>> tree = Tree.fromstring(
|
||||
... '(S (NP-SBJ (PRP I)) (VP (VBP feel) (ADJP-PRD (RB pretty) '
|
||||
... '(JJ good)) (PP-CLR (IN about) (NP (PRP it)))) (. .))')
|
||||
>>> tpp = TreePrettyPrinter(tree)
|
||||
>>> print(tpp.text())
|
||||
S
|
||||
__________________________|_____________________
|
||||
| VP |
|
||||
| ____________________|___________ |
|
||||
| | | PP-CLR |
|
||||
| | | _____|_____ |
|
||||
NP-SBJ | ADJP-PRD | NP |
|
||||
| | _______|______ | | |
|
||||
PRP VBP RB JJ IN PRP .
|
||||
| | | | | | |
|
||||
I feel pretty good about it .
|
||||
|
||||
>>> print(tpp.text(unicodelines=True))
|
||||
S
|
||||
┌──────────────────────────┼─────────────────────┐
|
||||
│ VP │
|
||||
│ ┌─────────────┬──────┴───────────┐ │
|
||||
│ │ │ PP-CLR │
|
||||
│ │ │ ┌─────┴─────┐ │
|
||||
NP-SBJ │ ADJP-PRD │ NP │
|
||||
│ │ ┌───────┴──────┐ │ │ │
|
||||
PRP VBP RB JJ IN PRP .
|
||||
│ │ │ │ │ │ │
|
||||
I feel pretty good about it .
|
||||
|
||||
A tree with long labels:
|
||||
|
||||
>>> tree = Tree.fromstring(
|
||||
... '(sentence (plural-noun-phrase (plural-noun Superconductors)) '
|
||||
... '(verb-phrase (plural-verb conduct) '
|
||||
... '(noun-phrase (singular-noun electricity))))')
|
||||
>>> tpp = TreePrettyPrinter(tree)
|
||||
>>> print(tpp.text(abbreviate=8, nodedist=2))
|
||||
sentence
|
||||
__________|__________
|
||||
| verb-phr.
|
||||
| __________|__________
|
||||
plural-n. | noun-phr.
|
||||
| | |
|
||||
plural-n. plural-v. singular.
|
||||
| | |
|
||||
Supercon. conduct electric.
|
||||
|
||||
>>> print(tpp.text(maxwidth=8, nodedist=2))
|
||||
sentence
|
||||
_________|________
|
||||
| verb-
|
||||
| phrase
|
||||
| ________|_________
|
||||
plural- | noun-
|
||||
noun- | phrase
|
||||
phrase | |
|
||||
| | |
|
||||
plural- plural- singular-
|
||||
noun verb noun
|
||||
| | |
|
||||
Supercon conduct electric
|
||||
ductors ity
|
||||
|
||||
A discontinuous tree:
|
||||
|
||||
>>> tree = Tree.fromstring(
|
||||
... '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
|
||||
... '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
|
||||
... '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
|
||||
>>> sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
|
||||
... ' zwemmen of terrassen .'.split())
|
||||
>>> tpp = TreePrettyPrinter(tree, sentence)
|
||||
>>> print(tpp.text())
|
||||
top
|
||||
_____|______________________________________________
|
||||
smain | |
|
||||
_______________________________|_____ | |
|
||||
| | inf | |
|
||||
| | _____|____ | |
|
||||
| | | inf | |
|
||||
| | | ____|_____ | |
|
||||
| | | | conj | |
|
||||
| | _____ | ___ | _________|______ | __________________ |
|
||||
| | inf | | | | | | |
|
||||
| | _________|_____ | ___ | _________ | | | | |
|
||||
| | pp | | | | | | | |
|
||||
| | ____|____ | | | | | | | |
|
||||
| | | np | | | | inf | inf |
|
||||
| | | ____|____ | | | | | | | |
|
||||
noun verb prep det noun verb verb verb punct verb vg verb punct
|
||||
| | | | | | | | | | | | |
|
||||
Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen .
|
||||
|
||||
>>> print(tpp.text(unicodelines=True))
|
||||
top
|
||||
┌─────┴──────────────────┬───────────────────────────┐
|
||||
smain │ │
|
||||
┌────┬──────────────────────────┴─────┐ │ │
|
||||
│ │ inf │ │
|
||||
│ │ ┌─────┴────┐ │ │
|
||||
│ │ │ inf │ │
|
||||
│ │ │ ┌────┴─────┐ │ │
|
||||
│ │ │ │ conj │ │
|
||||
│ │ ┌───── │ ─── │ ─────────┴────── │ ─────┬─────┬──────┐ │
|
||||
│ │ inf │ │ │ │ │ │ │
|
||||
│ │ ┌─────────┴───── │ ─── │ ─────────┐ │ │ │ │ │
|
||||
│ │ pp │ │ │ │ │ │ │ │
|
||||
│ │ ┌────┴────┐ │ │ │ │ │ │ │ │
|
||||
│ │ │ np │ │ │ │ inf │ inf │
|
||||
│ │ │ ┌────┴────┐ │ │ │ │ │ │ │ │
|
||||
noun verb prep det noun verb verb verb punct verb vg verb punct
|
||||
│ │ │ │ │ │ │ │ │ │ │ │ │
|
||||
Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen .
|
||||
|
||||
Importing TreePrettyPrinter
|
||||
---------------------------
|
||||
|
||||
First of all, a simple tree will be constructed::
|
||||
|
||||
>>> from nltk.tree import Tree
|
||||
>>> tree = Tree.fromstring('(S (NP Mary) (VP walks))')
|
||||
|
||||
We'll use this sample tree to show that the methods of importing `TreePrettyPrinter` work correctly:
|
||||
|
||||
- Recommended::
|
||||
|
||||
>>> from nltk.tree import TreePrettyPrinter
|
||||
>>> print(TreePrettyPrinter(tree).text())
|
||||
S
|
||||
____|____
|
||||
NP VP
|
||||
| |
|
||||
Mary walks
|
||||
|
||||
- Alternative but valid options::
|
||||
|
||||
>>> from nltk import TreePrettyPrinter
|
||||
>>> print(TreePrettyPrinter(tree).text())
|
||||
S
|
||||
____|____
|
||||
NP VP
|
||||
| |
|
||||
Mary walks
|
||||
|
||||
>>> from nltk.tree.prettyprinter import TreePrettyPrinter
|
||||
>>> print(TreePrettyPrinter(tree).text())
|
||||
S
|
||||
____|____
|
||||
NP VP
|
||||
| |
|
||||
Mary walks
|
||||
|
||||
- Deprecated, do not use::
|
||||
|
||||
>>> from nltk.treeprettyprinter import TreePrettyPrinter
|
||||
>>> print(TreePrettyPrinter(tree).text())
|
||||
S
|
||||
____|____
|
||||
NP VP
|
||||
| |
|
||||
Mary walks
|
||||
|
||||
This method will throw a DeprecationWarning::
|
||||
|
||||
Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead.
|
||||
@@ -0,0 +1,154 @@
|
||||
.. Copyright (C) 2001-2025 NLTK Project
|
||||
.. For license information, see LICENSE.TXT
|
||||
|
||||
-------------------------------------------
|
||||
Unit tests for the TreeTransformation class
|
||||
-------------------------------------------
|
||||
|
||||
>>> from copy import deepcopy
|
||||
>>> from nltk.tree import Tree, collapse_unary, chomsky_normal_form, un_chomsky_normal_form
|
||||
|
||||
>>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))"
|
||||
|
||||
>>> tree = Tree.fromstring(tree_string)
|
||||
>>> print(tree)
|
||||
(TOP
|
||||
(S
|
||||
(S
|
||||
(VP
|
||||
(VBN Turned)
|
||||
(ADVP (RB loose))
|
||||
(PP
|
||||
(IN in)
|
||||
(NP
|
||||
(NP (NNP Shane) (NNP Longman) (POS 's))
|
||||
(NN trading)
|
||||
(NN room)))))
|
||||
(, ,)
|
||||
(NP (DT the) (NN yuppie) (NNS dealers))
|
||||
(VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
|
||||
(. .)))
|
||||
|
||||
Make a copy of the original tree and collapse the subtrees with only one child
|
||||
|
||||
>>> collapsedTree = deepcopy(tree)
|
||||
>>> collapse_unary(collapsedTree)
|
||||
>>> print(collapsedTree)
|
||||
(TOP
|
||||
(S
|
||||
(S+VP
|
||||
(VBN Turned)
|
||||
(ADVP (RB loose))
|
||||
(PP
|
||||
(IN in)
|
||||
(NP
|
||||
(NP (NNP Shane) (NNP Longman) (POS 's))
|
||||
(NN trading)
|
||||
(NN room))))
|
||||
(, ,)
|
||||
(NP (DT the) (NN yuppie) (NNS dealers))
|
||||
(VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
|
||||
(. .)))
|
||||
|
||||
>>> collapsedTree2 = deepcopy(tree)
|
||||
>>> collapse_unary(collapsedTree2, collapsePOS=True, collapseRoot=True)
|
||||
>>> print(collapsedTree2)
|
||||
(TOP+S
|
||||
(S+VP
|
||||
(VBN Turned)
|
||||
(ADVP+RB loose)
|
||||
(PP
|
||||
(IN in)
|
||||
(NP
|
||||
(NP (NNP Shane) (NNP Longman) (POS 's))
|
||||
(NN trading)
|
||||
(NN room))))
|
||||
(, ,)
|
||||
(NP (DT the) (NN yuppie) (NNS dealers))
|
||||
(VP (AUX do) (NP (NP+RB little) (ADJP+RB right)))
|
||||
(. .))
|
||||
|
||||
Convert the tree to Chomsky Normal Form, i.e. each subtree has either two
|
||||
subtree children or a single leaf value. This conversion can be performed
|
||||
using either left- or right-factoring.
|
||||
|
||||
>>> cnfTree = deepcopy(collapsedTree)
|
||||
>>> chomsky_normal_form(cnfTree, factor='left')
|
||||
>>> print(cnfTree)
|
||||
(TOP
|
||||
(S
|
||||
(S|<S+VP-,-NP-VP>
|
||||
(S|<S+VP-,-NP>
|
||||
(S|<S+VP-,>
|
||||
(S+VP
|
||||
(S+VP|<VBN-ADVP> (VBN Turned) (ADVP (RB loose)))
|
||||
(PP
|
||||
(IN in)
|
||||
(NP
|
||||
(NP|<NP-NN>
|
||||
(NP
|
||||
(NP|<NNP-NNP> (NNP Shane) (NNP Longman))
|
||||
(POS 's))
|
||||
(NN trading))
|
||||
(NN room))))
|
||||
(, ,))
|
||||
(NP (NP|<DT-NN> (DT the) (NN yuppie)) (NNS dealers)))
|
||||
(VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))))
|
||||
(. .)))
|
||||
|
||||
>>> cnfTree = deepcopy(collapsedTree)
|
||||
>>> chomsky_normal_form(cnfTree, factor='right')
|
||||
>>> print(cnfTree)
|
||||
(TOP
|
||||
(S
|
||||
(S+VP
|
||||
(VBN Turned)
|
||||
(S+VP|<ADVP-PP>
|
||||
(ADVP (RB loose))
|
||||
(PP
|
||||
(IN in)
|
||||
(NP
|
||||
(NP (NNP Shane) (NP|<NNP-POS> (NNP Longman) (POS 's)))
|
||||
(NP|<NN-NN> (NN trading) (NN room))))))
|
||||
(S|<,-NP-VP-.>
|
||||
(, ,)
|
||||
(S|<NP-VP-.>
|
||||
(NP (DT the) (NP|<NN-NNS> (NN yuppie) (NNS dealers)))
|
||||
(S|<VP-.>
|
||||
(VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
|
||||
(. .))))))
|
||||
|
||||
Employ some Markov smoothing to make the artificial node labels a bit more
|
||||
readable. See the treetransforms.py documentation for more details.
|
||||
|
||||
>>> markovTree = deepcopy(collapsedTree)
|
||||
>>> chomsky_normal_form(markovTree, horzMarkov=2, vertMarkov=1)
|
||||
>>> print(markovTree)
|
||||
(TOP
|
||||
(S^<TOP>
|
||||
(S+VP^<S>
|
||||
(VBN Turned)
|
||||
(S+VP|<ADVP-PP>^<S>
|
||||
(ADVP^<S+VP> (RB loose))
|
||||
(PP^<S+VP>
|
||||
(IN in)
|
||||
(NP^<PP>
|
||||
(NP^<NP>
|
||||
(NNP Shane)
|
||||
(NP|<NNP-POS>^<NP> (NNP Longman) (POS 's)))
|
||||
(NP|<NN-NN>^<PP> (NN trading) (NN room))))))
|
||||
(S|<,-NP>^<TOP>
|
||||
(, ,)
|
||||
(S|<NP-VP>^<TOP>
|
||||
(NP^<S> (DT the) (NP|<NN-NNS>^<S> (NN yuppie) (NNS dealers)))
|
||||
(S|<VP-.>^<TOP>
|
||||
(VP^<S>
|
||||
(AUX do)
|
||||
(NP^<VP> (NP^<NP> (RB little)) (ADJP^<NP> (RB right))))
|
||||
(. .))))))
|
||||
|
||||
Convert the transformed tree back to its original form
|
||||
|
||||
>>> un_chomsky_normal_form(markovTree)
|
||||
>>> tree == markovTree
|
||||
True
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user