# -*- coding: utf-8 -*-
"""
Created on Fri Nov 16 13:35:33 2018 @author: zhen
"""
import urllib.request
from bs4 import BeautifulSoup

# Set the target rootUrl and build the request with urllib.request.Request
rootUrl = "https://www.cnblogs.com/"
request = urllib.request.Request(rootUrl)
header = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
# Use add_header to set the request header so the script looks like a browser
request.add_header("User-Agent", header)

# Open the page with urllib.request.urlopen and read the HTML
htmlUrl = urllib.request.urlopen(request).read()

# Build a BeautifulSoup instance from the HTML (an explicit parser avoids the bs4 warning)
beautifulSoup = BeautifulSoup(htmlUrl, "html.parser")

# Get the last page number (see the previous section for how this selector works)
total_page = int(beautifulSoup.find("div", class_="pager").findAll("a")[-2].get_text())
# Collect every post title link on the page
list_item = beautifulSoup.findAll("a", class_="titlelnk")
for i in list_item:  # Iterate over every post link
    href = i["href"]  # Get the post's href
    req = urllib.request.Request(href)
    req.add_header("User-Agent", header)
    html = urllib.request.urlopen(req).read()
    soup = BeautifulSoup(html, "html.parser")
    # Get the title
    titleContent = soup.find("a", id="cb_post_title_url")
    if titleContent is not None:  # Skip pages that have no title link
        title = titleContent.get_text()
        # Get the content
        content = soup.find("div").get_text().strip()
        print(title, "\n=====================================\n", content[1:100])
Crawler results:
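The script computes total_page but never uses it. As a minimal sketch, that value could drive a crawl over every list page; the /sitehome/p/<n> URL pattern and the crawl_all_pages helper below are illustrative assumptions, not taken from the original post.

import urllib.request
from bs4 import BeautifulSoup

header = ("Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36")

def fetch(url):
    # Same request pattern as the main script: spoof the User-Agent and read the HTML
    req = urllib.request.Request(url)
    req.add_header("User-Agent", header)
    return urllib.request.urlopen(req).read()

def crawl_all_pages(total_page):
    for page in range(1, total_page + 1):
        # Assumed pagination URL pattern; adjust to whatever the site actually uses
        url = "https://www.cnblogs.com/sitehome/p/%d" % page
        soup = BeautifulSoup(fetch(url), "html.parser")
        # Reuse the same title-link selector as the main script
        for link in soup.findAll("a", class_="titlelnk"):
            yield link["href"]

# Example use: for href in crawl_all_pages(total_page): print(href)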
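Likewise, soup.find("div") returns the first div on the post page, which is often navigation rather than the article body. A hedged alternative, assuming the body sits in a div with id "cnblogs_post_body" (an assumption about cnblogs' markup, not confirmed here):

def extract_body(soup):
    # Assumed container id for the post body on cnblogs; fall back to the
    # original first-div behaviour if that id is not present
    body = soup.find("div", id="cnblogs_post_body")
    if body is None:
        body = soup.find("div")
    return body.get_text().strip() if body is not None else ""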