NLTK09《Python自然语言处理》code08 分析句子结构

时间:2022-03-03 07:09:49

分析句子结构

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# 《Python自然语言处理》 08 分析句子结构
# pnlp08.py

# 8.1 一些语法困境
# 语言数据和无限可能性
# 普遍存在的歧义
import nltk
# Ubiquitous ambiguity: in this toy grammar the PP "in my pajamas" can attach
# either to the verb phrase (VP -> VP PP) or to the noun phrase
# (NP -> Det N PP), so the sentence below has more than one parse.
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
"""
)
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)
# Expected output (one tree per attachment of the PP):
# (S
#   (NP I)
#   (VP
#     (VP (V shot) (NP (Det an) (N elephant)))
#     (PP (P in) (NP (Det my) (N pajamas)))))
# (S
#   (NP I)
#   (VP
#     (V shot)
#     (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


# 8.2 文法的用途

# 8.3 上下文无关文法
# A small context-free grammar, parsed top-down with a recursive descent parser.
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
"""
)
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
# Expected output: (S (NP Mary) (VP (V saw) (NP Bob)))
# Writing your own grammar: load a CFG from a local file instead of a string.
# NOTE(review): 'file:mygrammar.cfg' is a relative location, presumably
# resolved against the current working directory -- the file must exist
# there; confirm before running.
grammar1 = nltk.data.load('file:mygrammar.cfg')
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
# Expected output when the file holds a grammar covering this sentence:
# (S (NP Mary) (VP (V saw) (NP Bob)))

# Example 8-2: a recursive context-free grammar.
# Nom -> Adj Nom is directly recursive, and S is indirectly recursive through
# VP -> V S (S -> NP VP -> NP V S ...).
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | PropN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
PropN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
"""
)

# 8.4 上下文无关文法分析
# 递归下降解析
# 移进-归约分析
# 左角落解析器
# 符合句子规则的子串表
# 例8-3 使用符合语句规则的子串表接收器
def init_wfst(tokens, grammar):
    """Build the initial well-formed substring table (WFST) for ``tokens``.

    The table is a square list of lists with ``len(tokens) + 1`` rows and
    columns, filled with ``None``.  For every token at position ``i`` the
    cell ``wfst[i][i + 1]`` is set to the left-hand side of the first
    production returned by ``grammar.productions(rhs=token)``.

    Args:
        tokens: sequence of input words.
        grammar: object exposing ``productions(rhs=...)`` whose results
            expose ``lhs()``.

    Returns:
        The newly created table.

    Raises:
        IndexError: if the grammar has no production for one of the tokens.
    """
    numtokens = len(tokens)
    wfst = [[None] * (numtokens + 1) for _ in range(numtokens + 1)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        if not productions:
            # Same exception type the bare ``productions[0]`` lookup would
            # raise, but with a message that names the offending word.
            raise IndexError(
                "no production in the grammar covers token %r at position %d"
                % (token, i))
        wfst[i][i + 1] = productions[0].lhs()
    return wfst

def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill in the WFST bottom-up by combining adjacent spans.

    For each span width from 2 up to ``len(tokens)`` and each split point
    ``mid``, the categories in ``wfst[start][mid]`` and ``wfst[mid][end]``
    are looked up as a right-hand side; on a match the left-hand side is
    stored in ``wfst[start][end]``.

    Args:
        wfst: table produced by ``init_wfst``; it is modified in place.
        tokens: the input words (only their count is used).
        grammar: object exposing ``productions()`` whose results expose
            ``rhs()`` and ``lhs()``.
        trace: when true, print one line per cell that gets filled.

    Returns:
        The same ``wfst`` object that was passed in.
    """
    # Maps a right-hand side tuple to its left-hand side.  When several
    # productions share a right-hand side, the last one listed wins.
    index = {p.rhs(): p.lhs() for p in grammar.productions()}
    numtokens = len(tokens)
    for span in range(2, numtokens + 1):
        # Only start positions whose span still fits inside the sentence.
        for start in range(numtokens - span + 1):
            end = start + span
            for mid in range(start + 1, end):
                nt1, nt2 = wfst[start][mid], wfst[mid][end]
                if nt1 and nt2 and (nt1, nt2) in index:
                    wfst[start][end] = index[(nt1, nt2)]
                    if trace:
                        print("[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]"
                              % (start, nt1, mid, nt2, end,
                                 start, index[(nt1, nt2)], end))
    return wfst

def display(wfst, tokens):
    """Print the WFST as a text table, with '.' for empty cells.

    The header row lists end positions ``1 .. len(wfst) - 1``; each following
    row starts with its start position.  Row labels and cells use the same
    column widths as the header so that the columns line up.

    Args:
        wfst: square table as built by ``init_wfst`` / ``complete_wfst``.
        tokens: unused; kept so existing call sites keep working.
    """
    size = len(wfst)
    print('\nWFST ' + ' '.join("%-4d" % i for i in range(1, size)))
    for i in range(size - 1):
        cells = ("%-4s" % (wfst[i][j] or '.') for j in range(1, size))
        # The 5-character row label matches the width of the 'WFST ' header
        # prefix; cells are 4 wide and joined by one space, like the header.
        print("%-4d " % i + ' '.join(cells))

# Demo: build and display the initial WFST for the Groucho sentence; only the
# cells spanning a single word are filled at this point.
tokens = "I shot an elephant in my pajamas".split()
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)
"""
WFST 1 2 3 4 5 6 7
0 NP . . . . . .
1 . V . . . . .
2 . . Det . . . .
3 . . . N . . .
4 . . . . P . .
5 . . . . . Det .
6 . . . . . . N
"""


# complete_wfst fills the table it is given in place and returns that same
# list, so wfst1 and wfst0 refer to the same (now completed) table.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar)
display(wfst1, tokens)
"""
WFST 1 2 3 4 5 6 7
0 NP . . S . . S
1 . V . VP . . VP
2 . . Det NP . . .
3 . . . N . . .
4 . . . . P . PP
5 . . . . . Det NP
6 . . . . . . N
"""


# Run the completion again with tracing on to print each combination step;
# the table passed in is the already-completed one from the previous call.
wfst1 = complete_wfst(wfst0, tokens, groucho_grammar, trace=True)
"""
[2] Det [3] N [4] ==> [2] NP [4]
[5] Det [6] N [7] ==> [5] NP [7]
[1] V [2] NP [4] ==> [1] VP [4]
[4] P [5] NP [7] ==> [4] PP [7]
[0] NP [1] VP [4] ==> [0] S [4]
[1] VP [4] PP [7] ==> [1] VP [7]
[0] NP [1] VP [7] ==> [0] S [7]
"""


# 8.5 依存关系和依存文法
import nltk
# Dependency grammar for the same sentence: each rule lists a head word and
# the words that may depend on it.
# NOTE(review): nltk.parse_dependency_grammar(...) is presumably the older
# spelling of this constructor -- confirm against the NLTK version in use.
groucho_dep_grammar = nltk.grammar.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
"""
)
print(groucho_dep_grammar)
# Expected output:
# Dependency grammar with 7 productions
# 'shot' -> 'I'
# 'shot' -> 'elephant'
# 'shot' -> 'in'
# 'elephant' -> 'an'
# 'elephant' -> 'in'
# 'in' -> 'pajamas'
# 'pajamas' -> 'my'

pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = 'I shot an elephant in my pajamas'.split()
trees = pdp.parse(sent)
for tree in trees:
    print(tree)
# Expected output ('in' depends on 'elephant' in one tree, on 'shot' in the other):
# (shot I (elephant an (in (pajamas my))))
# (shot I (elephant an) (in (pajamas my)))


# 配价与词汇

# 扩大规模

# 8.6 文法开发
# 树库和文法
from nltk.corpus import treebank
# Print the first parsed sentence of the treebank file 'wsj_0001.mrg';
# the string below records the output the author obtained.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)
"""
(S
(NP-SBJ
(NP (NNP Pierre) (NNP Vinken))
(, ,)
(ADJP (NP (CD 61) (NNS years)) (JJ old))
(, ,))
(VP
(MD will)
(VP
(VB join)
(NP (DT the) (NN board))
(PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
(NP-TMP (NNP Nov.) (CD 29))))
(. .))
"""


# Example 8-4: searching the treebank for verbs that take sentential complements.
def filter(tree):
    """Return True when ``tree`` is a VP node that has an S node as a child.

    Children that are not ``nltk.Tree`` instances are ignored.

    NOTE: the name shadows the builtin ``filter``; it is kept because the
    treebank search that follows passes this function by name.
    """
    if tree.label() != 'VP':
        return False
    return any(isinstance(child, nltk.Tree) and child.label() == 'S'
               for child in tree)

from nltk.corpus import treebank
# Collect, across every parsed treebank sentence, the subtrees accepted by the
# filter function defined above, then print the whole list.
res = [subtree for tree in treebank.parsed_sents()
for subtree in tree.subtrees(filter)]
print(res)
"""[Tree('VP', [Tree('VBN', ['named']), ..."""

import nltk
# Use the standard-library defaultdict directly instead of relying on the
# name happening to be re-exported from the nltk package.
from collections import defaultdict

# Group the PP-attachment training instances by "noun1-prep-noun2" and, for
# each attachment label, collect the verbs seen with that key.
entries = nltk.corpus.ppattach.attachments('training')
table = defaultdict(lambda: defaultdict(set))
for entry in entries:
    key = entry.noun1 + '-' + entry.prep + '-' + entry.noun2
    table[key][entry.attachment].add(entry.verb)

# Report only the keys that occur with more than one attachment label.
for key in sorted(table):
    if len(table[key]) > 1:
        print(key, 'N:', sorted(table[key]['N']), 'V:', sorted(table[key]['V']))
# Expected output (first lines):
# %-below-level N: ['left'] V: ['be']
# %-from-year N: ['was'] V: ['declined', 'dropped', 'fell', 'grew', 'increased', 'plunged', 'rose', 'was']
# ...


# Draw parsed sentence number 3450 of the Sinica treebank.
# NOTE(review): .draw() presumably opens a GUI window and blocks until it is
# closed -- confirm before running this script unattended.
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()

import nltk
# Pernicious ambiguity: 'fish' is both a noun phrase and a verb here, so a
# sentence made only of 'fish' tokens can be bracketed in several ways.
grammar = nltk.CFG.fromstring("""
S -> NP V NP
NP -> NP Sbar
Sbar -> NP V
NP -> 'fish'
V -> 'fish'
"""
)
tokens = ["fish"] * 5
cp = nltk.ChartParser(grammar)
for tree in cp.parse(tokens):
    print(tree)
# Expected output:
# (S (NP fish) (V fish) (NP (NP fish) (Sbar (NP fish) (V fish))))
# (S (NP (NP fish) (Sbar (NP fish) (V fish))) (V fish) (NP fish))


# 加权文法
# 例8-5 宾州树库样本中give和gave的用法
def give(t):
    """Return True for a VP headed by give/gave with an NP plus NP or PP-DTV.

    The node must be labelled 'VP', have more than two children, have an
    'NP' second child, a 'PP-DTV' or 'NP' third child, and contain 'give'
    or 'gave' among the leaves of its first child.
    """
    if t.label() != 'VP' or len(t) <= 2:
        return False
    verb, first_obj, second_obj = t[0], t[1], t[2]
    return (first_obj.label() == 'NP'
            and second_obj.label() in ('PP-DTV', 'NP')
            and not {'give', 'gave'}.isdisjoint(verb.leaves()))

def sent(t):
    """Join the leaves of ``t`` with spaces, skipping marker tokens.

    Tokens whose first character is '*', '-' or '0' are left out, as are
    empty tokens (which would otherwise fail on the first-character check).
    """
    skipped_prefixes = ('*', '-', '0')
    return ' '.join(token for token in t.leaves()
                    if token and not token.startswith(skipped_prefixes))

def print_node(t, width):
    """Print a one-line summary of the first three children of ``t``.

    The line shows the words of the first child, then the label and words of
    the second and third children.  Output longer than ``width`` characters
    is cut to ``width`` and followed by "...".
    """
    output = "%s %s: %s / %s: %s" % (
        sent(t[0]), t[1].label(), sent(t[1]), t[2].label(), sent(t[2]))
    if len(output) > width:
        output = output[:width] + "..."
    print(output)

# Print every give/gave verb phrase found in the treebank sample, one summary
# line per matching subtree, truncated to 72 characters.
for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)
# Expected output:
# gave NP: the chefs / NP: a standing ovation
# give NP: advertisers / NP: discounts for maintaining or increasing ad sp...
# give NP: it / PP-DTV: to the politicians
# gave NP: them / NP: similar help
# give NP: them / NP:
# give NP: only French history questions / PP-DTV: to students in a Europe...
# give NP: federal judges / NP: a raise
# give NP: consumers / NP: the straight scoop on the U.S. waste crisis
# gave NP: Mitsui / NP: access to a high-tech medical product
# give NP: Mitsubishi / NP: a window on the U.S. glass industry
# give NP: much thought / PP-DTV: to the rates she was receiving , nor to ...
# give NP: your Foster Savings Institution / NP: the gift of hope and free...
# give NP: market operators / NP: the authority to suspend trading in futu...
# gave NP: quick approval / PP-DTV: to $ 3.18 billion in supplemental appr...
# give NP: the Transportation Department / NP: up to 50 days to review any...
# give NP: the president / NP: such power
# give NP: me / NP: the heebie-jeebies
# give NP: holders / NP: the right , but not the obligation , to buy a cal...
# gave NP: Mr. Thomas / NP: only a `` qualified '' rating , rather than ``...
# give NP: the president / NP: line-item veto power


# 概率上下文无关文法
# 例8-6 定义一个概率上下文无关文法(PCFG)
import nltk
# A probabilistic CFG: every production carries its probability in square
# brackets, and the probabilities listed for each left-hand side add up to 1
# (e.g. VP: 0.4 + 0.3 + 0.3, NP: 0.8 + 0.2).
grammar = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
"""
)
print(grammar)
"""
Grammar with 9 productions (start state = S)
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
"""

# Parse with the PCFG bound to ``grammar``; each printed tree is followed by
# its probability.
viterbi_parser = nltk.ViterbiParser(grammar)

trees = viterbi_parser.parse(['Jack', 'saw', 'telescopes'])
for tree in trees:
    print(tree)
# Expected output (0.064 = 1.0 * 0.2 * 0.4 * 1.0 * 0.8):
# (S (NP Jack) (VP (TV saw) (NP telescopes))) (p=0.064)