Mastering Natural Language Processing with Python

import nltk
text=" Welcome readers. I hope you find it interesting. Please do reply."
from nltk.tokenize import sent_tokenize
print(sent_tokenize(text))


import nltk
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
text=" Hello everyone. Hope all are fine and doing well. Hope you find the book interesting"
print(tokenizer.tokenize(text))


import nltk
french_tokenizer=nltk.data.load('tokenizers/punkt/french.pickle')
print(french_tokenizer.tokenize('Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage au collège franco-britannique de Levallois-Perret. Deux agressions en quelques jours, voilà ce qui a motivé hier matin le débrayage à Levallois. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, en janvier, d’un professeur d’histoire. L’équipe pédagogique de ce collège de 750 élèves avait déjà été choquée par l’agression, mercredi, d’un professeur d’histoire'))


import nltk
text = nltk.word_tokenize("Pierre Vinken , 59 years old , will join as a nonexecutive director on Nov. 29 .")
print(text)


import nltk
from nltk import word_tokenize
r=input("Please write a text")
print("The length of text is",len(word_tokenize(r)),"words")


import nltk
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))


import nltk
text=nltk.word_tokenize(" Don't hesitate to ask questions")
print(text)


from nltk.tokenize import WordPunctTokenizer
tokenizer=WordPunctTokenizer()
print(tokenizer.tokenize(" Don't hesitate to ask questions"))




import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Don't hesitate to ask questions"))


import nltk
from nltk.tokenize import regexp_tokenize
sent="Don't hesitate to ask questions"
print(regexp_tokenize(sent, pattern=r'\w+|\$[\d\.]+|\S+'))


import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer(r'\s+', gaps=True)
print(tokenizer.tokenize("Don't hesitate to ask questions"))


import nltk
from nltk.tokenize import RegexpTokenizer
sent=" She secured 90.56 % in class X . She is a meritorious student"
capt = RegexpTokenizer(r'[A-Z]\w+')
print(capt.tokenize(sent))


import nltk
sent=" She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import BlanklineTokenizer
print(BlanklineTokenizer().tokenize(sent))


import nltk
sent=" She secured 90.56 % in class X . She is a meritorious student"
from nltk.tokenize import WhitespaceTokenizer
print(WhitespaceTokenizer().tokenize(sent))


import nltk
sent= "She secured 90.56 % in class X. She is a meritorious student"
print(sent.split())
print(sent.split(' '))
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(sent.split('\n'))


import nltk
from nltk.tokenize import BlanklineTokenizer
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(BlanklineTokenizer().tokenize(sent))
from nltk.tokenize import LineTokenizer
print(LineTokenizer(blanklines='keep').tokenize(sent))
print(LineTokenizer(blanklines='discard').tokenize(sent))


import nltk
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
from nltk.tokenize import SpaceTokenizer
print(SpaceTokenizer().tokenize(sent))


import nltk
from nltk.tokenize import WhitespaceTokenizer
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(WhitespaceTokenizer().span_tokenize(sent)))


import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.util import spans_to_relative
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(spans_to_relative(WhitespaceTokenizer().span_tokenize(sent))))




import nltk
from nltk.tokenize.util import string_span_tokenize
sent=" She secured 90.56 % in class X \n. She is a meritorious student\n"
print(list(string_span_tokenize(sent, " ")))


text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."]
from nltk.tokenize import word_tokenize
tokenized_docs=[word_tokenize(doc) for doc in text]
print(tokenized_docs)


import re
import string
text=[" It is a pleasant evening.","Guests, who came from US arrived at the venue","Food was tasty."]
from nltk.tokenize import word_tokenize
tokenized_docs=[word_tokenize(doc) for doc in text]
x=re.compile('[%s]' % re.escape(string.punctuation))    # note the usage here: re.escape() is called on the punctuation characters
tokenized_docs_no_punctuation = []
for review in tokenized_docs:
    new_review = []
    for token in review: 
        new_token = x.sub(u'', token)                  # the compiled pattern's sub() (re.sub) strips punctuation here
        if not new_token == u'':
            new_review.append(new_token)
    tokenized_docs_no_punctuation.append(new_review)
print(tokenized_docs_no_punctuation)






text='HARdWork IS KEy to SUCCESS'
print(text.lower())
print(text.upper())




import nltk
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))
words=["Don't", 'hesitate','to','ask','questions']
print([word for word in words if word not in stops])


from nltk.corpus import stopwords
print(stopwords.fileids())


import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))
def para_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    para = [w for w in text if w.lower() not in stopwords]
    return len(para) / len(text)
print(para_fraction(nltk.corpus.reuters.words()))
print(para_fraction(nltk.corpus.inaugural.words()))






import re

replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]

class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s


import nltk
from replacers import RegexpReplacer
replacer= RegexpReplacer()
replacer.replace("Don't hesitate to ask questions")
print(replacer.replace("She must've gone to the market but she didn't go"))






import nltk
from nltk.tokenize import word_tokenize
from replacers import RegexpReplacer
replacer=RegexpReplacer()
word_tokenize("Don't hesitate to ask questions")
print(word_tokenize(replacer.replace("Don't hesitate to ask questions")))




# replacers.py
import re

class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word






import nltk
from replacers import RepeatReplacer
replacer=RepeatReplacer()
print(replacer.replace('lotttt'))
print(replacer.replace('ohhhhh'))
print(replacer.replace('ooohhhhh'))




import re
from nltk.corpus import wordnet

class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word


import nltk
from replacers import RepeatReplacer
replacer=RepeatReplacer()
print(replacer.replace('happy'))




class WordReplacer(object):
    def __init__(self, word_map):
        self.word_map = word_map
    def replace(self, word):
        return self.word_map.get(word, word)


import nltk
from replacers import WordReplacer
replacer=WordReplacer({'congrats':'congratulations'})
print(replacer.replace('congrats'))
print(replacer.replace('maths'))






import nltk
from nltk.corpus import gutenberg
from nltk.probability import FreqDist
import matplotlib
matplotlib.use('TkAgg')          # select the backend before importing pyplot
import matplotlib.pyplot as plt
fd = FreqDist()
for text in gutenberg.fileids():
    for word in gutenberg.words(text):
        fd[word] += 1            # fd.inc() was removed in NLTK 3; use item assignment instead
ranks = []
freqs = []
for rank, (word, count) in enumerate(fd.most_common()):   # words in decreasing frequency order
    ranks.append(rank + 1)
    freqs.append(count)


plt.loglog(ranks, freqs)
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()


from __future__ import print_function
from nltk.metrics import *
training='PERSON OTHER PERSON OTHER OTHER ORGANIZATION'.split()
testing='PERSON OTHER OTHER OTHER OTHER OTHER'.split()
print(accuracy(training,testing))
trainset=set(training)
testset=set(testing)
precision(trainset,testset)
print(recall(trainset,testset))
print(f_measure(trainset,testset))






from __future__ import print_function

def _edit_dist_init(len1, len2):
    lev = []
    for i in range(len1):
        lev.append([0] * len2)   # initialize 2D array to zero
    for i in range(len1):
        lev[i][0] = i            # column 0: 0,1,2,3,4,...
    for j in range(len2):
        lev[0][j] = j            # row 0: 0,1,2,3,4,...
    return lev

def _edit_dist_step(lev, i, j, s1, s2, transpositions=False):
    c1 = s1[i - 1]
    c2 = s2[j - 1]
    # skipping a character in s1
    a = lev[i - 1][j] + 1
    # skipping a character in s2
    b = lev[i][j - 1] + 1
    # substitution
    c = lev[i - 1][j - 1] + (c1 != c2)
    # transposition
    d = c + 1   # never picked by default
    if transpositions and i > 1 and j > 1:
        if s1[i - 2] == c2 and s2[j - 2] == c1:
            d = lev[i - 2][j - 2] + 1
    # pick the cheapest
    lev[i][j] = min(a, b, c, d)

def edit_distance(s1, s2, transpositions=False):
    # set up a 2-D array
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)
    # iterate over the array
    for i in range(len1):
        for j in range(len2):
            _edit_dist_step(lev, i + 1, j + 1, s1, s2, transpositions=transpositions)
    return lev[len1][len2]
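
A quick sanity check of the edit_distance implementation defined above (as opposed to NLTK's own edit_distance imported in the next snippet); the "ab"/"ba" pair shows the effect of the transpositions flag. Expected values are noted in the comments.

# exercising the edit_distance defined above
print(edit_distance("relate", "relation"))               # 3
print(edit_distance("ab", "ba"))                          # 2: two substitutions
print(edit_distance("ab", "ba", transpositions=True))     # 1: a single transposition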


import nltk
from nltk.metrics import *
print(edit_distance("relate","relation"))
print(edit_distance("suggestion","calculation"))


def jacc_similarity(query, document):
    first = set(query).intersection(set(document))
    second = set(query).union(set(document))
    return len(first) / len(second)
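
A small check of jacc_similarity with two made-up token lists; note that the jaccard_distance used in the next snippet is its complement, 1 - similarity.

# exercising jacc_similarity defined above; the token lists are illustrative
query = ['nltk', 'makes', 'text', 'processing', 'easy']
document = ['text', 'processing', 'with', 'nltk', 'is', 'fun']
print(jacc_similarity(query, document))    # 3 shared tokens / 8 distinct tokens = 0.375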


import nltk
from nltk.metrics import *
X=set([10,20,30,40])
Y=set([20,30,60])
print(jaccard_distance(X,Y))




import nltk
from nltk.metrics import *
X = set([10,20,30,40])
Y= set([30,50,70])
print(binary_distance(X, Y))


def masi_distance(label1, label2):
    len_intersection = len(label1.intersection(label2))
    len_union = len(label1.union(label2))
    len_label1 = len(label1)
    len_label2 = len(label2)
    if len_label1 == len_label2 and len_label1 == len_intersection:
        m = 1
    elif len_intersection == min(len_label1, len_label2):
        m = 0.67
    elif len_intersection > 0:
        m = 0.33
    else:
        m = 0
    return 1 - (len_intersection / float(len_union)) * m
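
A quick check of the masi_distance function defined above, reusing set values like those in the surrounding examples; it should agree with nltk.metrics.masi_distance.

# exercising the masi_distance defined above
X = set([10, 20, 30, 40])
Y = set([30, 50, 70])
print(masi_distance(X, Y))    # 1 - (1/6) * 0.33 = 0.945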


import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
trigrams_tokens=ngrams(alpino.words(),3)
for i in trigrams_tokens:
    print(i)





import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
unigrams=ngrams(alpino.words(),1)
for i in unigrams:
    print(i)


import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
quadgrams=ngrams(alpino.words(),4)
for i in quadgrams:
    print(i)

import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
tokens=[t.lower() for t in webtext.words('grail.txt')]
words=BigramCollocationFinder.from_words(tokens)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))


import nltk
from nltk.collocations import *
text1="Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = nltk.collocations.BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))


import nltk
from nltk.util import ngrams
from nltk.corpus import alpino
print(alpino.words())
bigrams_tokens=ngrams(alpino.words(),2)
for i in bigrams_tokens:
    print(i) 

import nltk
from nltk.collocations import *
import nltk
text="Hello how are you doing ? I hope you find the book interesting"
tokens=nltk.wordpunct_tokenize(text)
fourgrams=nltk.collocations.QuadgramCollocationFinder.from_words(tokens)
for fourgram, freq in fourgrams.ngram_fd.items():
    print(fourgram,freq)

import nltk
from nltk.util import ngrams
sent=" Hello , please read the book thoroughly . If you have any queries , then don't hesitate to ask . There is no shortcut to success ."
n=5
fivegrams=ngrams(sent.split(),n)
for grams in fivegrams:
    print(grams)


from __future__ import print_function, unicode_literals
__docformat__ = 'epytext en'
try:
    import numpy
except ImportError:
    pass
import tempfile
import os
from collections import defaultdict
from nltk import compat
from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
from nltk.probability import DictionaryProbDist
from nltk.classify.api import ClassifierI
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
from nltk.classify.megam import (call_megam, write_megam_file, parse_megam_weights)
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights






# Excerpt adapted from nltk/probability.py; ProbDistI is the abstract base class.
from nltk.probability import ProbDistI

class MLEProbDist(ProbDistI):
    def __init__(self, freqdist, bins=None):
        self._freqdist = freqdist

    def freqdist(self):
        """
        This function returns the frequency distribution that the
        probability distribution is based on:
        """
        return self._freqdist

    def prob(self, sample):
        return self._freqdist.freq(sample)

    def max(self):
        return self._freqdist.max()

    def samples(self):
        return self._freqdist.keys()

    def __repr__(self):
        """
        It will return string representation of ProbDist
        """
        return '<MLEProbDist based on %d samples>' % self._freqdist.N()


class LidstoneProbDist(ProbDistI):
    """
    This class is used to obtain the frequency distribution. It is
    parameterized by a real number Gamma whose value lies between 0 and 1.
    LidstoneProbDist computes the probability of a given sample from the
    count c, the total number of sample outcomes N, and the number of sample
    values B that can be generated from the probability distribution, using
    the formula (c + Gamma) / (N + B * Gamma).
    This also means that Gamma is added to the count of every possible
    sample outcome, and the MLE is computed from the resulting frequency
    distribution:
    """
    SUM_TO_ONE = False

    def __init__(self, freqdist, gamma, bins=None):
        """
        Lidstone is used to compute the probability distribution that
        generates freqdist.
        The freqdist parameter can be defined as the frequency distribution
        that the probability estimates are based on.
        The bins parameter can be defined as the number of sample values
        that can be generated from the probability distribution; the
        probabilities sum to 1:
        """
        if (bins == 0) or (bins is None and freqdist.N() == 0):
            name = self.__class__.__name__[:-8]
            raise ValueError('A %s probability distribution ' % name +
                             'must have at least one bin.')
        if (bins is not None) and (bins < freqdist.B()):
            name = self.__class__.__name__[:-8]
            raise ValueError('\nThe number of bins in a %s distribution ' % name +
                             '(%d) must be greater than or equal to\n' % bins +
                             'the number of bins in the FreqDist used ' +
                             'to create it (%d).' % freqdist.B())
        self._freqdist = freqdist
        self._gamma = float(gamma)
        self._N = self._freqdist.N()
        if bins is None:
            bins = freqdist.B()
        self._bins = bins
        self._divisor = self._N + bins * gamma
        if self._divisor == 0.0:
            # In extreme cases we force the probability to be 0,
            # which it will be, since the count will be 0:
            self._gamma = 0
            self._divisor = 1

    def freqdist(self):
        """
        This function returns the frequency distribution that the
        probability distribution is based on:
        """
        return self._freqdist

    def prob(self, sample):
        c = self._freqdist[sample]
        return (c + self._gamma) / self._divisor

    def max(self):
        # To obtain most probable sample, choose the one
        # that occurs very frequently.
        return self._freqdist.max()

    def samples(self):
        return self._freqdist.keys()

    def discount(self):
        gb = self._gamma * self._bins
        return gb / (self._N + gb)

    def __repr__(self):
        """
        String representation of ProbDist is obtained.
        """
        return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()


class LaplaceProbDist(LidstoneProbDist):
    """
    This class is used to obtain the frequency distribution. It computes the
    probability of a sample from the count c, the total number of sample
    outcomes N, and the number of sample values B that can be generated,
    using the formula:
    (c+1)/(N+B)
    This also means that 1 is added to the count of every possible sample
    outcome, and the maximum likelihood estimate of the resulting frequency
    distribution is obtained:
    """
    def __init__(self, freqdist, bins=None):
        """
        LaplaceProbDist is used to obtain the probability distribution that
        generates freqdist.
        The freqdist parameter is the frequency distribution that the
        probability estimates are based on.
        The bins parameter can be thought of as the number of sample values
        that can be generated. The probabilities must sum to 1:
        """
        LidstoneProbDist.__init__(self, freqdist, 1, bins)

    def __repr__(self):
        """
        String representation of ProbDist is obtained.
        """
        return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()


class ELEProbDist(LidstoneProbDist):
    """
    This class is used to obtain the frequency distribution. It computes the
    probability of a sample from the count c, the total number of sample
    outcomes N, and the number of sample values B that can be generated,
    using the formula:
    (c+0.5)/(N+B/2)
    This also means that 0.5 is added to the count of every possible sample
    outcome, and the maximum likelihood estimate of the resulting frequency
    distribution is obtained:
    """
    def __init__(self, freqdist, bins=None):
        """
        The expected likelihood estimate is used to obtain the probability
        distribution that generates freqdist. The freqdist parameter is the
        frequency distribution that the probability estimates are based on.
        The bins parameter can be thought of as the number of sample values
        that can be generated. The probabilities must sum to 1:
        """
        LidstoneProbDist.__init__(self, freqdist, 0.5, bins)

    def __repr__(self):
        """
        String representation of ProbDist is obtained.
        """
        return '<ELEProbDist based on %d samples>' % self._freqdist.N()


class WittenBellProbDist(ProbDistI):
    """
    The WittenBellProbDist class is used to obtain a probability
    distribution. Based on the frequencies of the samples seen so far, it
    assigns a uniform probability mass to as-yet-unseen samples. The
    probability mass for a sample is computed as:
    T / (N + T)
    Here, T is the number of observed samples and N is the total number of
    observed events. The probability mass equals the maximum likelihood
    estimate of a new sample occurring. All probabilities sum to 1:
    Here,
    p = T / Z (N + T), if count = 0
    p = c / (N + T), otherwise
    """
    def __init__(self, freqdist, bins=None):
        """
        This code obtains the probability distribution. The probability is
        used to assign a uniform probability mass to unseen samples. The
        probability mass for a sample is computed as:
        T / (N + T)
        Here, T is the number of observed samples and N is the total number
        of observed events. The probability mass equals the maximum
        likelihood estimate of a new sample occurring. All probabilities sum
        to 1:
        Here,
        p = T / Z (N + T), if count = 0
        p = c / (N + T), otherwise
        Z is a normalization factor computed from these values and the bins
        value.
        The freqdist parameter is used to estimate the frequency counts from
        which the probability distribution is obtained.
        The bins parameter can be defined as the number of possible sample
        types:
        """
        assert bins is None or bins >= freqdist.B(), \
            'bins parameter must not be less than %d=freqdist.B()' % freqdist.B()
        if bins is None:
            bins = freqdist.B()
        self._freqdist = freqdist
        self._T = self._freqdist.B()
        self._Z = bins - self._freqdist.B()
        self._N = self._freqdist.N()
        # self._P0 is P(0), precalculated for efficiency:
        if self._N == 0:
            # if freqdist is empty, we approximate P(0) by a UniformProbDist:
            self._P0 = 1.0 / self._Z
        else:
            self._P0 = self._T / float(self._Z * (self._N + self._T))

    def prob(self, sample):
        # inherit docs from ProbDistI
        c = self._freqdist[sample]
        return (c / float(self._N + self._T) if c != 0 else self._P0)

    def max(self):
        return self._freqdist.max()

    def samples(self):
        return self._freqdist.keys()

    def freqdist(self):
        return self._freqdist

    def discount(self):
        raise NotImplementedError()

    def __repr__(self):
        """
        String representation of ProbDist is obtained.
        """
        return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
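
The classes above mirror the estimators that ship in nltk.probability, so their formulas can be checked directly; a minimal sketch with a made-up frequency distribution:

import nltk
from nltk.probability import FreqDist, MLEProbDist, LidstoneProbDist, LaplaceProbDist, ELEProbDist, WittenBellProbDist
# a tiny made-up frequency distribution: 'the' -> 3, 'book'/'is'/'interesting' -> 1 each (N=6, B=4)
fd = FreqDist(['the', 'the', 'the', 'book', 'is', 'interesting'])
print(MLEProbDist(fd).prob('the'))                      # 3/6 = 0.5
print(MLEProbDist(fd).prob('reader'))                   # 0.0 for an unseen word
print(LidstoneProbDist(fd, 0.1, bins=10).prob('the'))   # (3 + 0.1) / (6 + 10 * 0.1)
print(LaplaceProbDist(fd, bins=10).prob('reader'))      # (0 + 1) / (6 + 10)
print(ELEProbDist(fd, bins=10).prob('reader'))          # (0 + 0.5) / (6 + 10 * 0.5)
print(WittenBellProbDist(fd, bins=10).prob('reader'))   # T / (Z * (N + T)) = 4 / (6 * 10)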




import nltk
cor = nltk.corpus.brown.tagged_sents(categories='adventure')[:500]
print(len(cor))
from nltk.util import unique_list
tag_set = unique_list(tag for sent in cor for (word,tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in cor for (word,tag) in sent)
print(len(symbols))
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
train_corpus = []
test_corpus = []
for i in range(len(cor)):
    if i % 10:
        train_corpus+=[cor[i]]
    else:
        test_corpus+=[cor[i]]
print(len(train_corpus))
print(len(test_corpus))




import nltk
corpus=u"<s> hello how are you doing ? Hope you find the book interesting. </s>".split()
sentence=u"<s> how are you doing </s>".split()
vocabulary=set(corpus)
print(len(vocabulary))
cfd = nltk.ConditionalFreqDist(nltk.bigrams(corpus))
print([cfd[a][b] for (a,b) in nltk.bigrams(sentence)])
print([cfd[a].N() for (a,b) in nltk.bigrams(sentence)])
print([cfd[a].freq(b) for (a,b) in nltk.bigrams(sentence)])
print([1 + cfd[a][b] for (a,b) in nltk.bigrams(sentence)])
print([len(vocabulary) + cfd[a].N() for (a,b) in nltk.bigrams(sentence)])
print([1.0 * (1+cfd[a][b]) / (len(vocabulary)+cfd[a].N()) for (a,b) in nltk.bigrams(sentence)])
cpd_mle = nltk.ConditionalProbDist(cfd, nltk.MLEProbDist, bins=len(vocabulary))
print([cpd_mle[a].prob(b) for (a,b) in nltk.bigrams(sentence)])
cpd_laplace = nltk.ConditionalProbDist(cfd, nltk.LaplaceProbDist, bins=len(vocabulary))
print([cpd_laplace[a].prob(b) for (a,b) in nltk.bigrams(sentence)])


import nltk
from nltk.stem import PorterStemmer
stemmerporter = PorterStemmer()
print(stemmerporter.stem('working'))
print(stemmerporter.stem('happiness'))


import nltk
from nltk.stem import LancasterStemmer
stemmerlan=LancasterStemmer()
print(stemmerlan.stem('working'))
print(stemmerlan.stem('happiness'))


import nltk
from nltk.stem import RegexpStemmer
stemmerregexp=RegexpStemmer('ing')
print(stemmerregexp.stem('working'))
print(stemmerregexp.stem('happiness'))
print(stemmerregexp.stem('pairing'))


import nltk
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)
spanishstemmer=SnowballStemmer('spanish')
print(spanishstemmer.stem('comiendo'))
frenchstemmer=SnowballStemmer('french')
print(frenchstemmer.stem('manger'))






import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
def obtain_tokens():
    with open('/home/p/NLTK/sample1.txt') as stem:
        tok = nltk.word_tokenize(stem.read())
    return tok

def stemming(filtered):
    stem = []
    for x in filtered:
        stem.append(PorterStemmer().stem(x))
    return stem

if __name__ == "__main__":
    tok = obtain_tokens()
    print("tokens is %s" % tok)
    stem_tokens = stemming(tok)
    print("After stemming is %s" % stem_tokens)
    res = dict(zip(tok, stem_tokens))
    print("{tok:stemmed}=%s" % res)


import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer_output=WordNetLemmatizer()
print(lemmatizer_output.lemmatize('working'))
print(lemmatizer_output.lemmatize('working',pos='v'))
print(lemmatizer_output.lemmatize('works'))


import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer_output=PorterStemmer()
print(stemmer_output.stem('happiness'))
lemmatizer_output=WordNetLemmatizer()
print(lemmatizer_output.lemmatize('happiness'))




from polyglot.downloader import downloader
print(downloader.supported_languages_table("morph2"))


>>> import enchant
>>> s = enchant.Dict("en_US")
>>> tok=[]
>>> def tokenize(st1):
...     if not st1:
...         return
...     for j in range(len(st1), -1, -1):   # try the longest prefix that is a dictionary word
...         if s.check(st1[0:j]):
...             tok.append(st1[0:j])
...             st1 = st1[j:]
...             tokenize(st1)
...             break
>>> tokenize("itismyfavouritebook")
>>> tok
['it', 'is', 'my','favourite','book']
>>> tok=[ ]
>>> tokenize("ihopeyoufindthebookinteresting")
>>> tok
['i','hope','you','find','the','book','interesting']




import nltk
text1=nltk.word_tokenize("It is a pleasant day today")
print(nltk.pos_tag(text1))


nltk.help.upenn_tagset('NNS')
nltk.help.upenn_tagset('VB.*')


import nltk
text=nltk.word_tokenize("I cannot bear the pain of bear")
print(nltk.pos_tag(text))


import nltk
taggedword=nltk.tag.str2tuple('bear/NN')
print(taggedword)
print(taggedword[0])
print(taggedword[1])


import nltk
sentence='''The/DT sacred/VBN Ganga/NNP flows/VBZ in/IN this/DT region/NN ./. This/DT is/VBZ a/DT pilgrimage/NN ./. People/NNP from/IN all/DT over/IN the/DT country/NN visit/NN this/DT place/NN ./. '''
print([nltk.tag.str2tuple(t) for t in sentence.split()])




import nltk
taggedtok = ('bear', 'NN')
from nltk.tag.util import tuple2str
print(tuple2str(taggedtok))


import nltk
from nltk.corpus import treebank
treebank_tagged = treebank.tagged_words(tagset='universal')
tag = nltk.FreqDist(tag for (word, tag) in treebank_tagged)
print(tag.most_common())




import nltk
from nltk.corpus import treebank
treebank_tagged = treebank.tagged_words(tagset='universal')
tagpairs = nltk.bigrams(treebank_tagged)
preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN']
freqdist = nltk.FreqDist(preceders_noun)
print([tag for (tag, _) in freqdist.most_common()])


import nltk
tag={}
print(tag)
tag['beautiful']='ADJ'


tag['boy']='N'
tag['read']='V'
tag['generously']='ADV'
print(tag)


import nltk
from nltk.tag import DefaultTagger
tag = DefaultTagger('NN')
print(tag.tag(['Beautiful', 'morning']))


import nltk
from nltk.tag import untag
print(untag([('beautiful', 'NN'), ('morning', 'NN')]))


import nltk
import os,os.path
create = os.path.expanduser('~/nltkdoc')
if not os.path.exists(create):
    os.mkdir(create)
print(os.path.exists(create))
import nltk.data
print(create in nltk.data.path)


import nltk.data
nltk.data.load('nltkcorpora/important/firstdoc.txt',format='raw')


import nltk
from nltk.corpus import names
print(len(names.words('male.txt')))
print(len(names.words('female.txt')))
names.fileids()


import nltk
from nltk.corpus import words
print(words.fileids())
print(len(words.words('en')))
print(len(words.words('en-basic')))


import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
training= treebank.tagged_sents()[:7000]
unitagger=UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))


import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
training= treebank.tagged_sents()[:7000]
unitagger=UnigramTagger(training)
testing = treebank.tagged_sents()[2000:]
print(unitagger.evaluate(testing))




import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
unitag = UnigramTagger(model={'Vinken': 'NN'})
print(unitag.tag(treebank.sents()[0]))


import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
tag1=DefaultTagger('NN')
tag2=UnigramTagger(training,backoff=tag1)
print(tag2.evaluate(testing))


import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank
training_1= treebank.tagged_sents()[:7000]
bigramtagger=BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[2000:]
print(bigramtagger.evaluate(testing_1))


import nltk
from nltk.tag import BigramTagger, TrigramTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
bigramtag = BigramTagger(training)
print(bigramtag.evaluate(testing))
trigramtag = TrigramTagger(training)
print(trigramtag.evaluate(testing))


import nltk
from nltk.corpus import treebank
from nltk import NgramTagger
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
quadgramtag = NgramTagger(4, training)
print(quadgramtag.evaluate(testing))


import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
affixtag = AffixTagger(training)
print(affixtag.evaluate(testing))


import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
prefixtag = AffixTagger(training, affix_length=4)
print(prefixtag.evaluate(testing))


import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
suffixtag = AffixTagger(training, affix_length=-3)
print(suffixtag.evaluate(testing))


import nltk
from nltk.tag import AffixTagger
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
prefixtagger=AffixTagger(training,affix_length=4)
prefixtagger3=AffixTagger(training,affix_length=3,backoff=prefixtagger)
print(prefixtagger3.evaluate(testing))
suffixtagger3=AffixTagger(training,affix_length=-3,backoff=prefixtagger3)
print(suffixtagger3.evaluate(testing))
suffixtagger4=AffixTagger(training,affix_length=-4,backoff=suffixtagger3)
print(suffixtagger4.evaluate(testing))


import nltk
from nltk.tag import tnt
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
tnt_tagger=tnt.TnT()
tnt_tagger.train(training)
print(tnt_tagger.evaluate(testing))


import nltk
from nltk.tag import DefaultTagger
from nltk.tag import tnt
from nltk.corpus import treebank
testing = treebank.tagged_sents()[2000:]
training= treebank.tagged_sents()[:7000]
tnt_tagger=tnt.TnT()
unknown=DefaultTagger('NN')
tagger_tnt=tnt.TnT(unk=unknown,Trained=True)
tnt_tagger.train(training)
print(tnt_tagger.evaluate(testing))


import nltk
sent=[("A","DT"),("wise", "JJ"), ("small", "JJ"),("girl", "NN"), ("of", "IN"), ("village", "N"),  ("became", "VBD"), ("leader", "NN")]
grammar = "NP: {<DT>?<JJ>*<NN><IN>?<NN>*}"
find = nltk.RegexpParser(grammar)
res = find.parse(sent)
print(res)
res.draw()   # draw the chunk tree




import nltk
noun1=[("financial","NN"),("year","NN"),("account","NN"),("summary","NN")]
gram="NP:{<NN>+}"
find = nltk.RegexpParser(gram)
print(find.parse(noun1))
x=find.parse(noun1)
x.draw()




import nltk
import nltk.corpus
print(str(nltk.corpus.treebank).replace('\\\\','/'))
print(nltk.corpus.treebank.fileids())
from nltk.corpus import treebank
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))


import nltk
from nltk.corpus import treebank
print(treebank.parsed_sents('wsj_0007.mrg')[2])




import nltk
from nltk.corpus import treebank_chunk
print(treebank_chunk.chunked_sents()[1].leaves())
print(treebank_chunk.chunked_sents()[1].pos())
print(treebank_chunk.chunked_sents()[1].productions())
print(nltk.corpus.treebank.tagged_words())


import nltk
from nltk.probability import FreqDist
from nltk.corpus import treebank
fd = FreqDist()
print(fd.items())


import nltk
from nltk.corpus import sinica_treebank
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])


import nltk
from nltk import Nonterminal, nonterminals, Production, CFG
nonterminal1 = Nonterminal('NP')
nonterminal2 = Nonterminal('VP')
nonterminal3 = Nonterminal('PP')
print(nonterminal1.symbol())
print(nonterminal2.symbol())
print(nonterminal3.symbol())
print(nonterminal1==nonterminal2)
print(nonterminal2==nonterminal3)
print(nonterminal1==nonterminal3)
S, NP, VP, PP = nonterminals('S, NP, VP, PP')
N, V, P, DT = nonterminals('N, V, P, DT')
production1 = Production(S, [NP, VP])
production2 = Production(NP, [DT, NP])
production3 = Production(VP, [V, NP,NP,PP])
print(production1.lhs())
print(production1.rhs())
print(production3.lhs())
print(production3.rhs())
print(production3 == Production(VP, [V,NP,NP,PP]))
print(production2 == production3)


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
print(gram1)




import nltk
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
print(len(sent))
testingsent=sent[25]
print(testingsent[1])
print(testingsent[0])
sent=testingsent[0]


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser1 = nltk.parse.BottomUpChartParser(gram1)
chart1 = parser1.chart_parse(sent)
print((chart1.num_edges()))
print((len(list(chart1.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser2 = nltk.parse.BottomUpLeftCornerChartParser(gram1)
chart2 = parser2.chart_parse(sent)
print((chart2.num_edges()))
print((len(list(chart2.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser3 = nltk.parse.LeftCornerChartParser(gram1)
chart3 = parser3.chart_parse(sent)
print((chart3.num_edges()))
print((len(list(chart3.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser4 = nltk.parse.TopDownChartParser(gram1)
chart4 = parser4.chart_parse(sent)
print((chart4.num_edges()))
print((len(list(chart4.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser5 = nltk.parse.IncrementalBottomUpChartParser(gram1)
chart5 = parser5.chart_parse(sent)
print((chart5.num_edges()))
print((len(list(chart5.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser6 = nltk.parse.IncrementalBottomUpLeftCornerChartParser(gram1)
chart6 = parser6.chart_parse(sent)
print((chart6.num_edges()))
print((len(list(chart6.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser7 = nltk.parse.IncrementalLeftCornerChartParser(gram1)
chart7 = parser7.chart_parse(sent)
print((chart7.num_edges()))
print((len(list(chart7.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser8 = nltk.parse.IncrementalTopDownChartParser(gram1)
chart8 = parser8.chart_parse(sent)
print((chart8.num_edges()))
print((len(list(chart8.parses(gram1.start())))))


import nltk
gram1 = nltk.data.load('grammars/large_grammars/atis.cfg')
sent = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sent = nltk.parse.util.extract_test_sentences(sent)
testingsent=sent[25]
sent=testingsent[0]
parser9 = nltk.parse.EarleyChartParser(gram1)
chart9 = parser9.chart_parse(sent)
print((chart9.num_edges()))
print((len(list(chart9.parses(gram1.start())))))


import nltk
from nltk.corpus import treebank
from itertools import islice
from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2
gram2 = PCFG.fromstring("""
A -> B B [.3] | C B C [.7]
B -> B D [.5] | C [.5]
C -> 'a' [.1] | 'b' [0.9]
D -> 'b' [1.0]
""")
prod1 = gram2.productions()[0]
print(prod1)
prod2 = gram2.productions()[1]
print(prod2)
print(prod2.lhs())
print(prod2.rhs())
print((prod2.prob()))
print(gram2.start())
print(gram2.productions())


import nltk
from nltk.corpus import treebank
from itertools import islice
from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2
tokens = "Jack told Bob to bring my cookie".split()
grammar = toy_pcfg2
print(grammar)


import nltk
nltk.parse.chart.demo(5, print_times=False, trace=1,sent='John saw a dog', numparses=2)


import nltk
nltk.parse.chart.demo(2, print_times=False, trace=1,sent='John saw a dog', numparses=2)


import nltk
nltk.parse.featurechart.demo(print_times=False,print_grammar=True,parser=nltk.parse.featurechart.FeatureChartParser,sent='I saw a dog')




import nltk
nltk.boolean_ops()




import nltk
input_expr = nltk.sem.Expression.fromstring
print(input_expr('X | (Y -> Z)'))
print(input_expr('-(X & Y)'))
print(input_expr('X & Y'))
print(input_expr('X <-> -- X'))


import nltk
value = nltk.Valuation([('X', True), ('Y', False), ('Z', True)])
print(value['Z'])
domain = set()
v = nltk.Assignment(domain)
u = nltk.Model(domain, value)
print(u.evaluate('(X & Y)', v))
print(u.evaluate('-(X & Y)', v))
print(u.evaluate('(X & Z)', v))
print(u.evaluate('(X | Y)', v))


import nltk
input_expr = nltk.sem.Expression.fromstring
expression = input_expr('run(marcus)', type_check=True)
print(expression.argument)
print(expression.argument.type)
print(expression.function)
print(expression.function.type)
sign = {'run': '<e, t>'}
expression = input_expr('run(marcus)', signature=sign)
print(expression.function.type)


import nltk
locations=[('Jaipur', 'IN', 'Rajasthan'),('Ajmer', 'IN', 'Rajasthan'),('Udaipur', 'IN', 'Rajasthan'),('Mumbai', 'IN', 'Maharashtra'),('Ahmedabad', 'IN', 'Gujrat')]
q = [x1 for (x1, relation, x2) in locations if x2=='Rajasthan']
print(q)


from nltk import load_parser
test = load_parser('grammars/book_grammars/sql1.fcfg')
q=" What cities are in Greece"
t = list(test.parse(q.split()))
ans = t[0].label()['SEM']
ans = [s for s in ans if s]
q = ' '.join(ans)
print(q)
from nltk.sem import chat80
r = chat80.sql_query('corpora/city_database/city.db', q)
for p in r:
    print(p[0], end=" ")


import nltk
sentences1 = nltk.corpus.treebank.tagged_sents()[17]
print(nltk.ne_chunk(sentences1, binary=True))
sentences2 = nltk.corpus.treebank.tagged_sents()[7]
print(nltk.ne_chunk(sentences2, binary=True))
print(nltk.ne_chunk(sentences2))


import nltk
from nltk.corpus import conll2002
for documents in conll2002.chunked_sents('ned.train')[25]:
    print(documents)

import nltk
sentence = "I went to Greece to meet John";
tok=nltk.word_tokenize(sentence)
pos_tag=nltk.pos_tag(tok)
print(nltk.ne_chunk(pos_tag))


import nltk
nltk.tag.hmm.demo_pos()




import nltk
from nltk import pos_tag, word_tokenize
print(pos_tag(word_tokenize("John and Smith are going to NY and Germany")))




import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger
tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700])
sentence = ['John','and','Smith','went','to','NY','and','Germany']
for word, tag in tagger.tag(sentence):
    print(word,'->',tag)

import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
wn.synsets('cat')
wn.synsets('cat', pos=wn.VERB)
wn.synset('cat.n.01')
print(wn.synset('cat.n.01').definition())
print(len(wn.synset('cat.n.01').examples()))
print(wn.synset('cat.n.01').lemmas())
print([str(lemma.name()) for lemma in wn.synset('cat.n.01').lemmas()])
print(wn.lemma('cat.n.01.cat').synset())


import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
print(sorted(wn.langs()))
print(wn.synset('cat.n.01').lemma_names('ita'))
print(sorted(wn.synset('cat.n.01').lemmas('dan')))
print(sorted(wn.synset('cat.n.01').lemmas('por')))
print(len(wordnet.all_lemma_names(pos='n', lang='jpn')))
cat = wn.synset('cat.n.01')
print(cat.hypernyms())
print(cat.hyponyms())
print(cat.member_holonyms())
print(cat.root_hypernyms())
print(wn.synset('cat.n.01').lowest_common_hypernyms(wn.synset('dog.n.01')))


import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.path_similarity(cat))


import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.lch_similarity(cat))


import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.wup_similarity(cat))


import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
from nltk.corpus import genesis
genesis_ic = wn.ic(genesis, False, 0.0)
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.res_similarity(cat, brown_ic))
print(lion.res_similarity(cat, genesis_ic))
print(lion.jcn_similarity(cat, brown_ic))
print(lion.jcn_similarity(cat, genesis_ic))
print(lion.lin_similarity(cat, semcor_ic))




import nltk
import random
from nltk.corpus import movie_reviews
docs = [(list(movie_reviews.words(fid)), cat) for cat in movie_reviews.categories() for fid in movie_reviews.fileids(cat)]
random.shuffle(docs)
all_tokens = nltk.FreqDist(x.lower() for x in movie_reviews.words())
token_features = [word for (word, freq) in all_tokens.most_common(2000)]   # the 2000 most frequent words
print(token_features[:100])
 
def doc_features(docs):
    doc_words = set(docs)
    features = {}
    for word in token_features:
        features['contains(%s)' % word] = (word in doc_words)
    return features


print(doc_features(movie_reviews.words('pos/cv957_8737.txt')))
feature_sets = [(doc_features(d), c) for (d,c) in docs]
train_sets, test_sets = feature_sets[100:], feature_sets[:100]
classifiers = nltk.NaiveBayesClassifier.train(train_sets)
print(nltk.classify.accuracy(classifiers, test_sets))
classifiers.show_most_informative_features(5)




import nltk
from nltk.corpus import stopwords
print(stopwords.words('english'))
def not_stopwords(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)
print(not_stopwords(nltk.corpus.reuters.words()))


import nltk
expr_read = nltk.sem.DrtExpression.fromstring
expr1 = expr_read('([x], [John(x), Went(x)])')
print(expr1)
expr1.draw()
print(expr1.fol())




import nltk
expr_read = nltk.sem.DrtExpression.fromstring
expr2 = expr_read('([x,y], [John(x), Went(x),Sam(y),Meet(x,y)])')
print(expr2)
expr2.draw()
print(expr2.fol())




import nltk
expr_read = nltk.sem.DrtExpression.fromstring
expr3 = expr_read('([x], [John(x), eats(x)])+ ([y],[Sam(y),eats(y)])')
print(expr3)
print(expr3.simplify())
expr3.draw()




import nltk
expr_read = nltk.sem.DrtExpression.fromstring
expr4 = expr_read('([],[(([x],[student(x)])->([y],[book(y),read(x,y)]))])')
print(expr4.fol())