NLTK04《Python自然语言处理》code03 处理原始文本

时间:2022-02-20 07:28:07

处理原始文本

# -*- coding: utf-8 -*-
# win10 python3.5.3/python3.6.1 nltk3.2.4
# 《Python自然语言处理》 03 处理原始文本
# pnlp03.py

from __future__ import division
import nltk, re, pprint

# 3.1 从网络和硬盘访问文本
# 电子书
# urllib.request 是 Python3 标准库的一部分, 无需额外安装第三方包
from urllib.request import urlopen
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
raw = urlopen(url).read()
print(type(raw)) # <class 'bytes'>
#pprint.pprint(raw)
print(len(raw)) # 1201733

raw = raw.decode('utf-8')
tokens = nltk.word_tokenize(raw)
print(type(tokens)) # <class 'list'>
print(tokens[:10]) # ['The', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']

text = nltk.Text(tokens)
print(type(text)) # <class 'nltk.text.Text'>
print(text[1020:1040])
# ['AND', 'PUNISHMENT', 'PART', 'I', 'CHAPTER', 'I', 'On', 'an', 'exceptionally', 'hot', 'evening', 'early', ...

text.collocations()

n = raw.find("PART I")
print(n) # 5336
n = raw.rfind("End of Project Gutenberg's Crime")
print(n) # -1
raw = raw[5303:1157681]
n = raw.find("PART I")
print(n) # 33

# 处理HTML
from urllib.request import urlopen
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read()
print(html[:5]) # b'<!doc'

# https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
import bs4
import lxml
#raw = nltk.clean_html(html)
raw = bs4.BeautifulSoup(html, "lxml")
raw = raw.get_text()
tokens = nltk.word_tokenize(raw)
print(tokens)
# ['BBC', 'NEWS', '|', 'Health', '|', 'Blondes', "'to", ...
tokens = tokens[96:399]
text = nltk.Text(tokens)
text.concordance('gene')
# Displaying 5 of 5 matches:hey say too few people now carry the gene for blondes to last beyond the next ...

# 处理搜索引擎的结果
# 处理RSS订阅
# https://pypi.python.org/pypi/feedparser/
import feedparser
import bs4
import lxml
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title']) # Language Log
print(len(llog.entries)) # 13
post = llog.entries[2]
print(post.title) # Asses and asterisks
content = post.content[0].value
print(content[:70]) # <p>When <em>The Sun</em>, a famously prurient UK tabloid newspaper, ch
txt = bs4.BeautifulSoup(content, "lxml").get_text()
t0 = nltk.word_tokenize(txt)
print(t0)
# ['When', 'The', 'Sun', ',', 'a', 'famously', 'prurient', 'UK', 'tabloid', ...
t1 = nltk.word_tokenize(bs4.BeautifulSoup(llog.entries[2].content[0].value, "lxml").get_text())
print(t1)
# ['When', 'The', 'Sun', ',', 'a', 'famously', 'prurient', 'UK', ...

# 读取本地文件
# document.txt
# test document.txt
f = open('document.txt')
raw = f.read()
print(raw) # test document.txt
f.close()

f = open('document.txt')
for line in f:
    print(line.strip())
f.close()

path = nltk.data.find('corpora/abc/rural.txt')
raw = open(path, 'rU').read()
print(raw[:20]) # PM denies knowledge

# 从PDF、MS Word及其他二进制格式中提取文本
# pypdf pywin32
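# 一个最小的示意(非书中代码, 属补充假设): 若安装了较新版本的 pypdf(pip3 install pypdf),
# 可以按下面的写法读取 PDF 首页文本; 其中 example.pdf 是假设存在的文件名
import os
if os.path.exists('example.pdf'):
    import pypdf
    reader = pypdf.PdfReader('example.pdf')
    print(reader.pages[0].extract_text())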
# 获取用户输入
s = input("Enter some text: ")
print(type(s), s)

# NLP的流程
raw = open('document.txt').read()
print(type(raw)) # <class 'str'>
tokens = nltk.word_tokenize(raw)
print(type(tokens)) # <class 'list'>
words = [w.lower() for w in tokens]
print(type(words), words) # <class 'list'> ['test', 'document.txt']
vocab = sorted(set(words))
print(type(vocab), vocab) # <class 'list'> ['document.txt', 'test']

vocab.append('blog')
# raw.append('blog') # AttributeError: 'str' object has no attribute 'append'

# 字符串和列表不能连接
query = 'Who knows?'
beatles = ['john', 'paul', 'george', 'ringo']
# s = query + beatles # TypeError: must be str, not list

# 3.2 字符串:最底层的文本处理
# 字符串的基本操作
monty = 'Monty Python'
print(monty) # Monty Python
circus = "Monty Python's Flying Circus"
print(circus) # Monty Python's Flying Circus
circus = 'Monty Python\'s Flying Circus'
print(circus) # Monty Python's Flying Circus
## circus = 'Monty Python's Flying Circus' # SyntaxError: 未转义的单引号会提前结束字符串

couplet = "Shall I compare thee to a Summer's day?"\
    "Thou are more Lovely and more temperate:"
print(couplet)
# Shall I compare thee to a Summer's day?Thou are more Lovely and more temperate:
couplet = ("Rough winds do shake the darling duds of May, "
           "And Summer's lease hath all too short a date:")
print(couplet)
# Rough winds do shake the darling duds of May, And Summer's lease hath all too short a date:

couplet = """Shall I compare thee to a Summer's day? Thou are more lovely and more temperate:"""
print(couplet)
#Shall I compare thee to a Summer's day?
#Thou are more lovely and more temperate:

print('very' + 'very' + 'very') # veryveryvery
print('very'*3) # veryveryvery

a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]
for line in b:
    print(line)

# s = 'very' - 'y' # TypeError: unsupported operand type(s) for -: 'str' and 'str'
# s = 'very' / 2 # TypeError: unsupported operand type(s) for /: 'str' and 'int'

# 输出字符串
monty = 'Monty Python'
print(monty) # Monty Python
grail = 'Holy Grail'
print(monty + grail) # Monty PythonHoly Grail
print(monty, "and the", grail) # Monty Python and the Holy Grail

# 访问单个字符
print(monty[0]) # M
print(monty[3]) # t
print(monty[5]) # ' '
#print(monty[20]) # IndexError: string index out of range
print(monty[-1]) # n
print(monty[5]) # ' '
print(monty[-7]) # ' '

sent = 'colorless green ideas sleep furiously'
for char in sent:
    print(char, end='') # 结尾不要换行符

from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
print(list(fdist.keys()))
# ['m', 'o', 'b', 'y', 'd', 'i', 'c', 'k', 'h', 'e', 'r', 'a', 'n', 'l', 'v', 't', 'g', 's', 'u', 'p', 'w', 'x', 'q', 'f', 'j', 'z']

# 访问子字符串
print(monty[6:10]) # Pyth
print(monty[-12:-7]) # Monty
print(monty[:5]) # Monty
print(monty[6:]) # Python

phrase = 'And now for something completely different'
if 'thing' in phrase:
    print('found "thing"')
# found "thing"

# 更多字符串操作
# s.find(t) 字符串s中包含t的第一个索引(没找到返回-1)
# s.rfind(t) 字符串s中包含t的最后一个索引(没找到返回-1)
# s.index(t) 与s.find(t)功能类似,但没有找到时引发ValueError
# s.rindex(t) 与s.rfind(t)功能类似,但没有找到时引发ValueError
# s.join(text) 以s为分隔符,将text中的词汇连接成一个字符串
# s.split(t) 在所有找到t的位置将s分割成链表(默认为空白符)
# s.splitlines() 将s按行分割成字符串链表
# s.lower() 将字符串s小写
# s.upper() 将字符串s大写
# s.title() 将字符串s中每个词的首字母大写(标题格式)
# s.strip() 返回去掉首尾空白字符的s的副本
# s.replace(t, u) 用u替换s中的t
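# 用一个简短的示例演示上表中的几个方法(示例字符串为任意选取, 仅作补充说明):
s = 'Natural Language Processing'
print(s.find('Language'))          # 8
print(s.lower())                   # natural language processing
print(s.replace(' ', '_'))         # Natural_Language_Processing
print('-'.join(['a', 'b', 'c']))   # a-b-c
print('  hello  '.strip())         # hello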

# 链表与字符串的差异
query = 'Who knows?'
beatles = ['John', 'Paul', 'George', 'Ringo']
print(query[2]) # 'o'
print(beatles[2]) # 'George'
print(query[:2]) # 'Wh'
print(beatles[:2]) # ['John', 'Paul']
print(query + " I dot't") # "Who knows? I don't"
#print(beatles + 'Brian') # TypeError: can only concatenate list (not "str") to list
print(beatles + ['Brian']) # ['John', 'Paul', 'George', 'Ringo', 'Brian']

beatles[0] = "John Lennon"
del beatles[-1]
print(beatles) # ['John Lennon', 'Paul', 'George']
# 字符串是不可变的
# query[0] = 'F' # TypeError: 'str' object does not support item assignment

# 3.3 使用unicode进行文字处理
# 什么是unicode
# 从文件中提取已编码文本
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
import codecs
f = codecs.open(path, encoding='latin2')
for line in f:
    line = line.strip()
    print(line.encode('unicode_escape'))
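# 补充说明: Python3 的内置 open() 同样支持 encoding 参数, 可以不借助 codecs(等价写法的示意):
with open(path, encoding='latin2') as f2:
    print(f2.readline().strip().encode('unicode_escape'))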

print(ord('a')) # 97
a = u'\u0061'
print(a) # a

nacute = u'\u0144'
print(nacute) # ń
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf)) # b'\xc5\x84'

import unicodedata
lines = codecs.open(path, encoding = 'latin2').readlines()
line = lines[2]
print(line.encode('unicode_escape'))
for c in line:
    if ord(c) > 127:
        print('%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)))

print(line.find(u'zosta\u0142y'))
line = line.lower()
print(line.encode('unicode_escape'))
import re
m = re.search(u'\u015b\w*', line)
print(m.group())
print(nltk.word_tokenize(line))

# 在Python中使用本地编码
# 在源文件的第一行或第二行写编码声明: # -*- coding: utf-8 -*-
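# 补充示意: 在声明了 utf-8 编码的源文件里可以直接书写非 ASCII 字符串字面值(示例字符串为任意选取):
sample = 'Państwowa'
print(sample)                            # Państwowa
print(sample.encode('unicode_escape'))   # b'Pa\\u0144stwowa'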

# 3.4 使用正则表达式检测词组搭配
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]
print(wordlist)

# 使用基本的元字符
# <<ed$>>表示以ed结尾的词汇
res = [w for w in wordlist if re.search('ed$', w)]
print(res)
# 通配符"."匹配任意单个字符
res = [w for w in wordlist if re.search('^..j..t..$', w)]
print(res)

# 范围与闭包
res = [w for w in wordlist if re.search('^[hgi][mno][jlk][def]$', w)]
print(res)

chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
res = [w for w in chat_words if re.search('^m+i+n+e+$', w)]
print(res)

res = [w for w in chat_words if re.search('^[ha]+$', w)]
print(res)

wsj = sorted(set(nltk.corpus.treebank.words()))
res = [w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)]
print(res)
res = [w for w in wsj if re.search('^[A-Z]+$', w)]
print(res)
res = [w for w in wsj if re.search('^[0-9]{4}$', w)]
print(res)
res = [w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]
print(res)
res = [w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
print(res)
res = [w for w in wsj if re.search('(ed|ing)$', w)]
print(res)

# . 通配符,匹配所有字符
# ^abc 匹配以abc开始的字符串
# abc$ 匹配以abc结尾的字符串
# [abc] 匹配字符集合
# [A-Z0-9] 匹配字符范围
# ed|ing|s 匹配指定字符串之一(析取)
# * 前面的项目0个或多个,如a*、[a-z]*(也叫Kleene闭包)
# + 前面的项目1个或多个,如a+、[a-z]+
# ? 前面的项目0个或1个(即:可选),如:a?、[a-z]?
# {n} 重复n次,n为非负整数
# {n,} 至少重复n次
# {,n} 至多重复n次
# {m,n} 至少重复m次,不多于n次
# a(b|c)+ 括号表示操作符的范围
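# 结合上表, 用几个简短的例子演示这些元字符(示例字符串为任意选取, 仅作补充说明):
print(bool(re.search('^pre', 'prefix')))      # True, 以pre开头
print(bool(re.search('ing$', 'running')))     # True, 以ing结尾
print(bool(re.search('^[0-9]{4}$', '2022')))  # True, 恰好4位数字
print(bool(re.search('^a(b|c)+d$', 'acbd')))  # True, 括号限定|和+的作用范围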

# 3.5 正则表达式的有益应用
# 提取字符块
word = 'supercalifragilisticexpialidocious'
res = re.findall(r'[aeiou]', word)
print(res)
print(len(res))

wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))
print(list(fd.items()))

# 在字符串上做更多事情
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]
cfd = nltk.ConditionalFreqDist(cvs)
cfd.tabulate()

cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index = nltk.Index(cv_word_pairs)
print(cv_index['su'])

# 查找词干
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]

print(re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))    # ['ing']
print(re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))  # ['processing']
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing'))  # [('process', 'ing')]
print(re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))   # 贪婪匹配: [('processe', 's')]
print(re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes'))  # 非贪婪匹配: [('process', 'es')]

def stem(word):
    # 后缀组加上?使没有后缀的词也能匹配; findall返回(词干, 后缀)元组的列表
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem

raw = """DENNIS: Listen, strange women lying in ponds distributing swords ... is no basis for a system of government. Supreme executive power derivers from ... a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
print(tokens)
res = [stem(t) for t in tokens if len(t) > 0]
print(res)

# 搜索已分词文本
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")
chat.findall(r"<l.*>{3,}")

from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

# 3.6 规范化文本
raw = """DENNIS: Listen, strange women lying in ponds distributing swords ... is no basis for a system of government. Supreme executive power derivers from ... a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
# 词干提取器
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
res = [porter.stem(t) for t in tokens]
print(res)
res = [lancaster.stem(t) for t in tokens]
print(res)

class IndexedText(object):
    def __init__(self, stemmer, text):
        self._text = text
        self._stemmer = stemmer
        self._index = nltk.Index((self._stem(word), i)
                                 for (i, word) in enumerate(text))

    def concordance(self, word, width = 40):
        key = self._stem(word)
        wc = int(width / 4)
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i-wc:i])
            rcontext = ' '.join(self._text[i:i+wc])
            ldisplay = '%*s' % (width, lcontext[-width:])
            rdisplay = '%-*s' % (width, rcontext[:width])
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = IndexedText(porter, grail)
text.concordance("lie")

# 词形归并
wnl = nltk.WordNetLemmatizer()
res = [wnl.lemmatize(t) for t in tokens]
print(res)
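# 补充说明: lemmatize() 默认按名词处理, 传入词性参数会得到不同的结果(一个小示例):
print(wnl.lemmatize('women'))       # woman
print(wnl.lemmatize('lying', 'v'))  # lie
print(wnl.lemmatize('lying'))       # lying, 按名词处理时保持原样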

# 3.7 用正则表达式为文本分词
# 分词的简单方法
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone ... though), 'I won't have any pepper in my kitchen AT ALL. Soup does very ... well without--Maybe it's always pepper that makes people hot-tempered,'..."""

print(re.split(r' ', raw))
print(re.split(r'[ \t\n]+', raw))
print(re.split(r'\W+', raw))
print(re.findall(r'\w+|\S\w*', raw))
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))

# NLTK的正则表达式分词器 nltk.regexp_tokenize()
text = 'That U.S.A. poster-print costs $12.40...'
# 改用非捕获组(?:...): regexp_tokenize基于re.findall, 捕获组会使结果只剩分组内容
pattern = r"""(?x) (?:[A-Z]\.)+ | \w+(?:-\w+)* | \$?\d+(?:\.\d+)?%? | \.\.\. | [][.,;"'?():-_`] """
print(nltk.regexp_tokenize(text, pattern))
# ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']

# 3.8 分割
# 断句
l1 = len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())
print(l1)

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
sents = sent_tokenizer.tokenize(text)
pprint.pprint(sents[171:181])
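# 补充说明: 也可以直接调用 nltk.sent_tokenize(), 它默认加载同一个 Punkt 英文模型(示意):
sents2 = nltk.sent_tokenize(text)
print(len(sents2))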

# 分词
def segment(text, segs):
    words = []
    last = 0
    for i in range(len(segs)):
        if segs[i] == '1':
            words.append(text[last:i+1])
            last = i + 1
    words.append(text[last:])
    return words
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
res = segment(text, seg1)
print(res)
res = segment(text, seg2)
print(res)

def evaluate(text, segs):
    words = segment(text, segs)
    text_size = len(words)
    lexicon_size = len(' '.join(list(set(words))))
    return text_size + lexicon_size
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
res = segment(text, seg3)
print(res)
res = evaluate(text, seg3)
print(res)
res = evaluate(text, seg2)
print(res)
res = evaluate(text, seg1)
print(res)

from random import randint

def flip(segs, pos):
    return segs[:pos] + str(1 - int(segs[pos])) + segs[pos + 1:]

def flip_n(segs, n):
    for i in range(n):
        segs = flip(segs, randint(0, len(segs)-1))
    return segs

def anneal(text, segs, iterations, cooling_rate):
    temperature = float(len(segs))
    while temperature > 0.5:
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print("")
    return segs
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)

# 3.9 格式化:从链表到字符串
# 从链表到字符串
silly = ['We', 'called', 'him', 'Tortoise', 'because', 'he', 'taught', 'us', '.']
res = ' '.join(silly)
print(res) # We called him Tortoise because he taught us .
res = ';'.join(silly)
print(res) # We;called;him;Tortoise;because;he;taught;us;.
res = ''.join(silly)
print(res) # WecalledhimTortoisebecausehetaughtus.

# 字符串与格式化
word = 'cat'
sentence = """hello world"""
print(word)
print(sentence)

fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])
for word in fdist:
    print(word, '->', fdist[word], ";", end='')
print("")

for word in fdist:
    print('%s->%d;' % (word, fdist[word]), end='')
print("")

template = 'Lee wants a %s right now'
menu = ['sandwich', 'spam fritter', 'pancake']
for snack in menu:
    print(template % snack)

# 排列
print('%6s' % 'dog')
print('%-6s' % 'dog')
width = 6
print('%-*s' % (width, 'dog'))

count, total = 3205, 9375
print("accuracy for %d words: %2.4f%%" % (total, 100*count/total))

# 这种打印排版格式比较常用
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end='')
    for word in words:
        print('%6s' % word, end='')
    print("")
    for category in categories:
        print('%-16s' % category, end='')
        for word in words:
            print('%6d' % cfdist[category][word], end='')
        print("")

from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)

# 将结果写入文件
output_file = open('output.txt', 'w')
words = set(nltk.corpus.genesis.words('english-kjv.txt'))
for word in sorted(words):
    output_file.write(word + "\n")
print(len(words))
print(str(len(words)))
output_file.write(str(len(words)) + "\n")
output_file.close()

# 文本换行
from textwrap import fill
saying = ['After', 'all', 'is', 'said', 'and', 'done', ',', 'more', 'is', 'said', 'than', 'done', '.']
for word in saying:
    print(word, '(' + str(len(word)) + ')', end=" ")
print("")
format = '%s (%d),'
pieces = [format % (word, len(word)) for word in saying]
output = ' '.join(pieces)
wrapped = fill(output)
print(wrapped)