Python 自然语言处理(自学一)

时间:2022-04-07 07:16:15
#coding=utf-8
from __future__ import division
from nltk.book import *

# Print the name of the text.
print(text1)

# Show every occurrence of the word "monstrous" in the text.
# concordance(), similar() and common_contexts() print their own results and
# return None, so they are called directly; wrapping them in print would emit
# an extra "None" line.
text2.concordance("monstrous")

# Find other words that appear in contexts similar to "monstrous".
text2.similar("monstrous")

# Find the contexts shared by two or more words.
text2.common_contexts(["monstrous", "very"])

# Draw a dispersion plot showing where certain words occur in the text.
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

# Count all tokens (words and punctuation) in the text.
print(len(text3))

# List and count the distinct types (tokens after de-duplication).
print(sorted(set(text3)))
print(len(set(text3)))

# Average number of times each distinct token is used.
print(len(text3) / len(set(text3)))

# Count how often a particular word occurs in the text.
print(text3.count("smote"))
def lexical_diversity(text):
    """Return the average number of times each distinct token occurs in *text*.

    The value is the total token count divided by the number of distinct
    tokens. The division is true division (this module enables it with
    ``from __future__ import division``).

    Args:
        text: A sized collection of hashable tokens, e.g. a list of words.

    Returns:
        ``len(text) / len(set(text))``, or ``0.0`` when *text* is empty.
    """
    distinct = len(set(text))
    if not distinct:
        # An empty text has no tokens at all; avoid dividing by zero.
        return 0.0
    return len(text) / distinct

# Percentage of the text taken up by a particular word (wrapped in a function below).
print(100 * text4.count("a") / len(text4))
def percentage(count, total):
    """Express *count* as a percentage of *total*.

    Args:
        count: The part being measured.
        total: The whole that *count* is measured against.

    Returns:
        ``100 * count / total``.
    """
    scaled = 100 * count
    return scaled / total

# Treat a text as a list of words and operate on it.
sent1 = ["Call", "me", "Zty", "."]
sent2 = ["Hello NLP!"]
print(len(sent1))
print(lexical_diversity(sent1))
print(sent1 + sent2)
sent1.append("some")
print(sent1)

# Index into, search and slice a text just like a list.
print(text4[173])
print(text4.index("awaken"))
print(text5[16715:16735])

# Indexing, slicing and slice assignment on a plain list of strings "1".."10".
sent = [str(number) for number in range(1, 11)]
# A single formatted string yields the same space-separated line on Python 2 and 3.
print("%s %s" % (sent[0], sent[9]))
print("%s %s" % (sent[5:8], sent[:3]))
print(text2[141525:])
sent[0], sent[9] = "first", "last"
# Replacing an eight-item slice with two items shortens the list.
sent[1:9] = ["second", "third"]
print(sent)

# Sorting a list of words.
my_sent = ["bold", "Sir", "Robin"]
print(sorted(my_sent))

# Strings support indexing, slicing, repetition and concatenation.
name = "Monty"
print(name[0])
print(name[:4])
print(name * 2)
print(name + "!")

# Convert between a list of words and a single string.
print(" ".join(["Monty", "Python"]))
print("Monty Python".split())

# Use FreqDist to find the 50 most common words in the text.
fdist1 = FreqDist(text1)
print(fdist1)
print(len(text1))
print(len(set(text1)))
# Rank the words by count explicitly: the order of keys() is not guaranteed to
# be by frequency, and on Python 3 keys() cannot be sliced.
vocab1 = sorted(fdist1, key=fdist1.get, reverse=True)
print(vocab1[:50])
print(fdist1["whale"])

# Show the cumulative frequency plot of the 50 most common words.
fdist1.plot(50, cumulative=True)

# Print the low-frequency words (hapaxes).
print(fdist1.hapaxes())

# Collect the words longer than 15 characters.
V = set(text1)
long_words = [w for w in V if len(w) > 15]
print(sorted(long_words))

# Collect the words longer than 7 characters that occur more than 7 times.
fdist5 = FreqDist(text5)
print(sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7))

# Show the collocations found in the text.
# collocations() prints its own result and returns None, so it is not wrapped
# in print (that would emit an extra "None" line).
text4.collocations()

# Distribution of word lengths in the text.
ls = [len(w) for w in text1]
fdist = FreqDist(ls)
# list() makes the output a plain list on both Python 2 and 3.
print(list(fdist.keys()))
print(list(fdist.items()))
print(fdist.max())
print(fdist[3])
print(fdist.freq(3))