Python web page scraping demo

Posted: 2021-10-19 04:01:59
# HTML text extraction with BeautifulSoup
from bs4 import BeautifulSoup

html_sample = '''
<html>
<body>
<h1 id="title">Hello world</h1>
<a href="#www.baidu.com" class="link"> This is link1</a>
<a href="#link2" class="link"> This is link2</a>
</body>
</html>'''

soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.text)                         # all text in the document

print(soup.select('h1'))                 # select() returns a list of matching tags
print(soup.select('h1')[0].text)         # text of the first <h1>
print(soup.select('a')[0].text)          # text of the first <a>
print(soup.select('a')[1].text)          # text of the second <a>

for alink in soup.select('a'):           # iterate over every <a> tag
    print(alink.text)

print(soup.select('#title')[0].text)     # select by id
print(soup.select('.link')[0].text)      # select by class

alinks = soup.select('a')
for link in alinks:
    print(link['href'])                  # href attribute of each link
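
One pitfall in the example above: select() always returns a list, so indexing the result of a selector that matches nothing raises an IndexError. Below is a minimal sketch of a safer lookup; the helper name first_text is made up for illustration.

from bs4 import BeautifulSoup

def first_text(soup, selector, default=''):
    # Return the text of the first element matching the CSS selector,
    # or a default value when nothing matches, instead of raising IndexError.
    matches = soup.select(selector)
    return matches[0].text if matches else default

demo = BeautifulSoup('<p class="intro">hi</p>', 'html.parser')
print(first_text(demo, '.intro'))    # hi
print(first_text(demo, '.missing'))  # '' (no match, so the default is returned)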

Demo 2: fetch a live news page with requests and export the headlines to Excel with pandas.

import requests
import pandas
from bs4 import BeautifulSoup

res = requests.get('http://news.qq.com/')
soup = BeautifulSoup(res.text, 'html.parser')

newsary = []
for news in soup.select('.Q-tpWrap .text'):     # each news entry on the page
    newsary.append({'title': news.select('a')[0].text,
                    'url': news.select('a')[0]['href']})

newsdf = pandas.DataFrame(newsary)              # one row per news item
newsdf.to_excel('news.xlsx')                    # writing .xlsx requires openpyxl
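
Note that the selector '.Q-tpWrap .text' depends on how news.qq.com was marked up when this demo was written and may no longer match anything. The following is a hedged sketch of the same pattern with a status check, explicit encoding, and a guard against empty results; the URL and selector are carried over from the demo above, not verified.

import requests
import pandas
from bs4 import BeautifulSoup

res = requests.get('http://news.qq.com/', timeout=10)
res.raise_for_status()                  # stop early on HTTP errors
res.encoding = res.apparent_encoding    # avoid mojibake on Chinese pages

soup = BeautifulSoup(res.text, 'html.parser')
rows = []
for news in soup.select('.Q-tpWrap .text'):    # selector assumed from the demo above
    links = news.select('a')
    if links:                                  # skip entries without a link
        rows.append({'title': links[0].text.strip(),
                     'url': links[0].get('href', '')})

if rows:
    pandas.DataFrame(rows).to_excel('news.xlsx', index=False)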

Recommended: Jupyter Notebook is a very convenient environment for working through these exercises.