NLTK10《Python自然语言处理》code09 建立基于特征的文法

时间:2022-03-24 07:19:16

建立基于特征的文法

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# 《Python自然语言处理》 09 建立基于特征的文法
# pnlp09.py

import nltk
# 9.1 文法特征
kim = {'CAT': 'NP', 'ORTH': 'Kim', 'REF': 'k'}
chase = {'CAT': 'V', 'ORTH': 'chased', 'REL': 'chase'}
# kim and chase share the features CAT (grammatical category) and ORTH
# (orthography, i.e. spelling).  Each also has a semantically oriented
# feature: kim['REF'] is the referent of kim, chase['REL'] is the relation
# expressed by chase.
chase['AGT'] = 'sbj'  # sbj: subject (placeholder, overwritten below)
chase['PAT'] = 'obj'  # obj: object (placeholder, overwritten below)

sent = "Kim chased Lee"
tokens = sent.split()
lee = {'CAT': 'NP', 'ORTH': 'Lee', 'REF': 'l'}


def lex2fs(word):
    """Return the lexical entry whose 'ORTH' feature equals *word*.

    The lookup scans the module-level entries kim, lee and chase in that
    order and returns the matching dict itself (not a copy), so callers
    that mutate the result mutate the entry.  Returns None when no entry
    matches.
    """
    for fs in [kim, lee, chase]:
        if fs['ORTH'] == word:
            return fs
    return None


subj, verb, obj = lex2fs(tokens[0]), lex2fs(tokens[1]), lex2fs(tokens[2])
verb['AGT'] = subj['REF']  # agent of 'chase' is Kim
verb['PAT'] = obj['REF']  # patient of 'chase' is Lee
for k in ['ORTH', 'REL', 'AGT', 'PAT']:  # check featstruct of 'chase'
    print("%-5s => %s" % (k, verb[k]))
"""
ORTH  => chased
REL   => chase
AGT   => k
PAT   => l
"""


surprise = {'CAT': 'V', 'ORTH': 'surprised', 'REL': 'surprise', 'SRC': 'sbj', 'EXP': 'obj'}

# 句法协议

# 使用属性和约束

# Example 9-1: an example feature-based grammar.
# Display the feat0.fcfg grammar resource; the transcript of what was
# printed is recorded in the string literal that follows.
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
"""
% start S
# ###################
# Grammar Productions
# ###################
# S expansion productions
S -> NP[NUM=?n] VP[NUM=?n]
# NP expansion productions
NP[NUM=?n] -> N[NUM=?n]
NP[NUM=?n] -> PropN[NUM=?n]
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
NP[NUM=pl] -> N[NUM=pl]
# VP expansion productions
VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
# ###################
# Lexical Productions
# ###################
Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'
PropN[NUM=sg]-> 'Kim' | 'Jody'
N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children'
IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks'
TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk'
TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
IV[TENSE=past] -> 'disappeared' | 'walked'
TV[TENSE=past] -> 'saw' | 'liked'
"""


# Example 9-2: trace a feature-based chart parser.
from nltk import load_parser

tokens = 'Kim likes children'.split()
# trace=2 makes the parser report the chart edges it adds; the transcript
# of that report is recorded in the string literal below.
cp = load_parser(
    'grammars/book_grammars/feat0.fcfg',
    trace=2,
)
trees = cp.parse(tokens)
"""
|.Kim .like.chil.|
Leaf Init Rule:
|[----] . .| [0:1] 'Kim'
|. [----] .| [1:2] 'likes'
|. . [----]| [2:3] 'children'
Feature Bottom Up Predict Combine Rule:
|[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' *
Feature Bottom Up Predict Combine Rule:
|[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] *
Feature Bottom Up Predict Combine Rule:
|[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'}
Feature Bottom Up Predict Combine Rule:
|. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' *
Feature Bottom Up Predict Combine Rule:
|. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'}
Feature Bottom Up Predict Combine Rule:
|. . [----]| [2:3] N[NUM='pl'] -> 'children' *
Feature Bottom Up Predict Combine Rule:
|. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] *
Feature Bottom Up Predict Combine Rule:
|. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'}
Feature Single Edge Fundamental Rule:
|. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] *
Feature Single Edge Fundamental Rule:
|[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] *
"""


# Print every parse tree found for the sentence.
for tree in trees:
    print(tree)
"""
(S[]
(NP[NUM='sg'] (PropN[NUM='sg'] Kim))
(VP[NUM='sg', TENSE='pres']
(TV[NUM='sg', TENSE='pres'] likes)
(NP[NUM='pl'] (N[NUM='pl'] children))))
"""


# 术语

# 9.2 Processing feature structures
# Build a feature structure from keyword arguments (feature=value pairs).
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
"""
[ NUM = 'sg' ]
[ TENSE = 'past' ]
"""


# A feature structure is indexed like a dict: values can be read and assigned.
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
print(fs1['GND']) # fem
fs1['CASE'] = 'acc'
# A feature value may itself be a feature structure (here AGR holds fs1).
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2)
"""
[ [ CASE = 'acc' ] ]
[ AGR = [ GND = 'fem' ] ]
[ [ NUM = 'pl' ] ]
[ [ PER = 3 ] ]
[ ]
[ POS = 'N' ]
"""

# Indexing with the feature name returns the nested structure.
print(fs2['AGR'])
"""
[ CASE = 'acc' ]
[ GND = 'fem' ]
[ NUM = 'pl' ]
[ PER = 3 ]
"""

# Nested values are reached by chaining the index operations.
print(fs2['AGR']['PER']) # 3


# A feature structure can also be written as a bracketed string.
print(nltk.FeatStruct("[POS='N', AGR=[PER=3, NUM='pl', GND='fem']]"))
"""
[ [ GND = 'fem' ] ]
[ AGR = [ NUM = 'pl' ] ]
[ [ PER = 3 ] ]
[ ]
[ POS = 'N' ]
"""


# Feature names and values need not be linguistic; here a person record.
print(nltk.FeatStruct(name='Lee', telno='01 27 86 42 96', age=33))
"""
[ age = 33 ]
[ name = 'Lee' ]
[ telno = '01 27 86 42 96' ]
"""


# Structure sharing: (1) tags a sub-structure and ->(1) refers back to it,
# so SPOUSE's ADDRESS is printed as a pointer to the top-level ADDRESS.
print(nltk.FeatStruct("""[NAME='Lee', ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
SPOUSE=[NAME='Kim', ADDRESS->(1)]]"""
))
"""
[ ADDRESS = (1) [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ SPOUSE = [ ADDRESS -> (1) ] ]
[ [ NAME = 'Kim' ] ]
"""


# One tagged value can be the target of several references (D and E here).
print(nltk.FeatStruct("[A='a', B=(1)[C='c'], D->(1), E->(1)]"))
"""
[ A = 'a' ]
[ ]
[ B = (1) [ C = 'c' ] ]
[ ]
[ D -> (1) ]
[ E -> (1) ]
"""


# Subsumption and unification
# unify() merges the information of two compatible feature structures.
fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
fs2 = nltk.FeatStruct(CITY='Paris')
print(fs1.unify(fs2))
"""
[ CITY = 'Paris' ]
[ NUMBER = 74 ]
[ STREET = 'rue Pascal' ]
"""

# Swapping the operands prints the same result.
print(fs2.unify(fs1))
"""
[ CITY = 'Paris' ]
[ NUMBER = 74 ]
[ STREET = 'rue Pascal' ]
"""


# Conflicting values for the same feature (A='a' vs A='b'): the result
# printed below is None.
fs0 = nltk.FeatStruct(A='a')
fs1 = nltk.FeatStruct(A='b')
fs2 = fs0.unify(fs1)
print(fs2) # None

# The spouse's address repeats Lee's address as a separate, non-shared
# sub-structure.  Feature names are case-sensitive, so the nested address
# uses NUMBER exactly like the top-level ADDRESS (it was misspelled
# 'number', which created a different feature).
fs0 = nltk.FeatStruct("""[NAME=Lee,
ADDRESS=[NUMBER=74,STREET='rue Pascal'],
SPOUSE=[NAME=Kim, ADDRESS=[NUMBER=74,STREET='rue Pascal']]]"""
)
print(fs0)
"""
[ ADDRESS = [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ [ ADDRESS = [ NUMBER = 74 ] ] ]
[ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
[ [ ] ]
[ [ NAME = 'Kim' ] ]
"""


# fs1 only specifies the city of the spouse's address.  Because the two
# addresses in fs0 are independent copies, unification adds CITY to the
# spouse's address only; Lee's own ADDRESS is left unchanged.
fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]")
print(fs1.unify(fs0))
"""
[ ADDRESS = [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ [ [ CITY = 'Paris' ] ] ]
[ [ ADDRESS = [ NUMBER = 74 ] ] ]
[ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
[ [ ] ]
[ [ NAME = 'Kim' ] ]
"""


# Here SPOUSE's ADDRESS is a reference to the top-level ADDRESS
# ((1) / ->(1)).  fs1 (bound earlier in this file to
# [SPOUSE=[ADDRESS=[CITY=Paris]]]) therefore contributes CITY to the one
# shared address structure, so the recorded output must contain a CITY row.
fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'],
SPOUSE=[NAME=Kim, ADDRESS->(1)]]"""
)
print(fs1.unify(fs2))
"""
[ [ CITY = 'Paris' ] ]
[ ADDRESS = (1) [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ SPOUSE = [ ADDRESS -> (1) ] ]
[ [ NAME = 'Kim' ] ]
"""


# Variables: fs2 gives ADDRESS1 and ADDRESS2 the same variable ?x, which
# requires both features to take the same value.
fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]")
fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]")
print(fs2)
"""
[ ADDRESS1 = ?x ]
[ ADDRESS2 = ?x ]
"""

# Unifying with fs1 supplies a value for ADDRESS1; the recorded output that
# follows shows ADDRESS2 pointing at that same value.
print(fs2.unify(fs1))
"""
[ ADDRESS1 = (1) [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ ADDRESS2 -> (1) ]
"""


# 9.3 扩展基于特征的文法
# 子类别

# 核心词

# 助动词和倒装

# 无限制依赖成分

# Example 9-3: grammar with productions for inverted clauses and
# long-distance dependencies, making use of slash categories.
# Display the feat1.fcfg grammar resource; the transcript of what was
# printed is recorded in the string literal that follows.
nltk.data.show_cfg('grammars/book_grammars/feat1.fcfg')
"""
% start S
# ###################
# Grammar Productions
# ###################
S[-INV] -> NP VP
S[-INV]/?x -> NP VP/?x
S[-INV] -> NP S/NP
S[-INV] -> Adv[+NEG] S[+INV]
S[+INV] -> V[+AUX] NP VP
S[+INV]/?x -> V[+AUX] NP VP/?x
SBar -> Comp S[-INV]
SBar/?x -> Comp S[-INV]/?x
VP -> V[SUBCAT=intrans, -AUX]
VP -> V[SUBCAT=trans, -AUX] NP
VP/?x -> V[SUBCAT=trans, -AUX] NP/?x
VP -> V[SUBCAT=clause, -AUX] SBar
VP/?x -> V[SUBCAT=clause, -AUX] SBar/?x
VP -> V[+AUX] VP
VP/?x -> V[+AUX] VP/?x
# ###################
# Lexical Productions
# ###################
V[SUBCAT=intrans, -AUX] -> 'walk' | 'sing'
V[SUBCAT=trans, -AUX] -> 'see' | 'like'
V[SUBCAT=clause, -AUX] -> 'say' | 'claim'
V[+AUX] -> 'do' | 'can'
NP[-WH] -> 'you' | 'cats'
NP[+WH] -> 'who'
Adv[+NEG] -> 'rarely' | 'never'
NP/NP ->
Comp -> 'that'
"""

from nltk import load_parser

# Parse a wh-question with the feat1 grammar and print every tree found.
# The recorded output below shows the gap after "like" as NP[]/NP[].
tokens = 'who do you claim that you like'.split()
cp = load_parser('grammars/book_grammars/feat1.fcfg')
for tree in cp.parse(tokens):
    print(tree)
"""
(S[-INV]
(NP[+WH] who)
(S[+INV]/NP[]
(V[+AUX] do)
(NP[-WH] you)
(VP[]/NP[]
(V[-AUX, SUBCAT='clause'] claim)
(SBar[]/NP[]
(Comp[] that)
(S[-INV]/NP[]
(NP[-WH] you)
(VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] )))))))
"""


# The same parser (cp, loaded from feat1.fcfg) also handles a sentence
# without any gap.
tokens = 'you claim that you like cats'.split()
for tree in cp.parse(tokens):
    print(tree)
"""
(S[-INV]
(NP[-WH] you)
(VP[]
(V[-AUX, SUBCAT='clause'] claim)
(SBar[]
(Comp[] that)
(S[-INV]
(NP[-WH] you)
(VP[] (V[-AUX, SUBCAT='trans'] like) (NP[-WH] cats))))))
"""


# A sentence with a negative adverb followed by an inverted clause,
# parsed with the same feat1 parser (cp).
tokens = 'rarely do you sing'.split()
for tree in cp.parse(tokens):
    print(tree)
"""
(S[-INV]
(Adv[+NEG] rarely)
(S[+INV]
(V[+AUX] do)
(NP[-WH] you)
(VP[] (V[-AUX, SUBCAT='intrans'] sing))))
"""


# Example 9-4: an example feature-based grammar (German).
# Display the german.fcfg grammar resource; the string literal that follows
# records an abridged transcript of the output.
nltk.data.show_cfg('grammars/book_grammars/german.fcfg')
"""
% start S
# Grammar Productions
S -> NP[CASE=nom, AGR=?a] VP[AGR=?a]
NP[CASE=?c, AGR=?a] -> PRO[CASE=?c, AGR=?a]
NP[CASE=?c, AGR=?a] -> Det[CASE=?c, AGR=?a] N[CASE=?c, AGR=?a]
...
"""

# Parse a German sentence with the german.fcfg grammar and print every
# tree found.  In the recorded parse below, 'folge' carries OBJCASE='dat'
# and the object NP 'den Katzen' has CASE='dat'.
tokens = 'ich folge den Katzen'.split()
cp = nltk.load_parser('grammars/book_grammars/german.fcfg')
for tree in cp.parse(tokens):
    print(tree)
"""
(S[]
(NP[AGR=[NUM='sg', PER=1], CASE='nom']
(PRO[AGR=[NUM='sg', PER=1], CASE='nom'] ich))
(VP[AGR=[NUM='sg', PER=1]]
(TV[AGR=[NUM='sg', PER=1], OBJCASE='dat'] folge)
(NP[AGR=[GND='fem', NUM='pl', PER=3], CASE='dat']
(Det[AGR=[NUM='pl', PER=3], CASE='dat'] den)
(N[AGR=[GND='fem', NUM='pl', PER=3]] Katzen))))
"""


# Same grammar, but with singular 'Katze'.  trace=2 makes the parser report
# the chart edges so the point of failure is visible: in the recorded trace
# below no edge spans the whole sentence, so this loop prints no tree.
tokens = 'ich folge den Katze'.split()
cp = nltk.load_parser('grammars/book_grammars/german.fcfg', trace=2)
for tree in cp.parse(tokens):
    print(tree)
"""
|.ich.fol.den.Kat.|
Leaf Init Rule:
|[---] . . .| [0:1] 'ich'
|. [---] . .| [1:2] 'folge'
|. . [---] .| [2:3] 'den'
|. . . [---]| [3:4] 'Katze'
Feature Bottom Up Predict Combine Rule:
|[---] . . .| [0:1] PRO[AGR=[NUM='sg', PER=1], CASE='nom'] -> 'ich' *
Feature Bottom Up Predict Combine Rule:
|[---] . . .| [0:1] NP[AGR=[NUM='sg', PER=1], CASE='nom'] -> PRO[AGR=[NUM='sg', PER=1], CASE='nom'] *
Feature Bottom Up Predict Combine Rule:
|[---> . . .| [0:1] S[] -> NP[AGR=?a, CASE='nom'] * VP[AGR=?a] {?a: [NUM='sg', PER=1]}
Feature Bottom Up Predict Combine Rule:
|. [---] . .| [1:2] TV[AGR=[NUM='sg', PER=1], OBJCASE='dat'] -> 'folge' *
Feature Bottom Up Predict Combine Rule:
|. [---> . .| [1:2] VP[AGR=?a] -> TV[AGR=?a, OBJCASE=?c] * NP[CASE=?c] {?a: [NUM='sg', PER=1], ?c: 'dat'}
Feature Bottom Up Predict Combine Rule:
|. . [---] .| [2:3] Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] -> 'den' *
|. . [---] .| [2:3] Det[AGR=[NUM='pl', PER=3], CASE='dat'] -> 'den' *
Feature Bottom Up Predict Combine Rule:
|. . [---> .| [2:3] NP[AGR=?a, CASE=?c] -> Det[AGR=?a, CASE=?c] * N[AGR=?a, CASE=?c] {?a: [NUM='pl', PER=3], ?c: 'dat'}
Feature Bottom Up Predict Combine Rule:
|. . [---> .| [2:3] NP[AGR=?a, CASE=?c] -> Det[AGR=?a, CASE=?c] * N[AGR=?a, CASE=?c] {?a: [GND='masc', NUM='sg', PER=3], ?c: 'acc'}
Feature Bottom Up Predict Combine Rule:
|. . . [---]| [3:4] N[AGR=[GND='fem', NUM='sg', PER=3]] -> 'Katze' *
"""