python_爬百度百科词条

如何爬取？
　　明确目标：爬取百度百科，给定初始百度词条：python，初始URL：http://baike.baidu.com/item/Python，爬取数据量为1000条，只爬取标题、简介和简介中的url
　　怎么爬： 利用谷歌开发工具，分析html结构，分析查询层次与方法
　　怎么写： 面向过程和面向对象两个方向
环境声明：
　　python 3.5.0
　　requests 库
　　beautifulsoup 库
使用面向过程的方式爬取：

#!/usr/bin/python3

import re

import bs4

import requests

from bs4 import BeautifulSoup

# 从百度百科爬取数据为三个字段，标题，简介，关联URL

# 给定初始百度词条：python，初始URL：http://baike.baidu.com/item/Python，爬取数据量为1000条

# 那就先有4个模块，URL管理器，下载器，解析器，数据展示

# 通过requests、BeautifulSoup两个库，实现下载器和解析器,通过两个集合数据类型，实现URL管理器

# URL拼接 起始url ：http://baike.baidu.com

# new_urls = set()

# old_urls = set()

# 已经在old_urls中的URL不再爬取；不在old_urls中的URL添加到new_urls中，之后每次从new_urls中取出一个URL进行爬取

def url_manager(links):
    """Queue newly discovered URLs that have not been crawled yet.

    Every URL in ``links`` that is not already in the module-level
    ``old_urls`` set is added to the module-level ``new_urls`` set.

    Args:
        links: Iterable of URL strings (set, list, ...). ``None`` or an
            empty collection leaves the queue untouched.
    """
    if not links:
        return
    # set() lets callers pass any iterable, not only a set; the set
    # difference drops everything that was already crawled.
    new_urls.update(set(links) - old_urls)

def download_html(url, timeout=10):
    """Download a page and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The response body as text, or ``None`` when the request failed or
        the server answered with an HTTP error status.
    """
    headers = {
        'Referer': 'http://baike.baidu.com/item/Python',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # Covers connection errors, timeouts and HTTP error statuses, but
        # lets KeyboardInterrupt / SystemExit propagate.
        return None
    # Use the encoding detected from the body rather than the one declared
    # in the response headers.
    response.encoding = response.apparent_encoding
    return response.text

def analysis(page_html, one_url):
    """Extract title, summary and summary links from an entry page.

    Appends the record ``"<one_url> : <title>_<summary>"`` to the
    module-level ``message`` list.

    Args:
        page_html: HTML text of the entry page.
        one_url: URL the page was downloaded from; only used in the record.

    Returns:
        Set of absolute URLs built from the ``/item/...`` links found
        inside the summary block (empty when there are none).

    Raises:
        ValueError: If the page has no title or no summary element.
    """
    base_url = 'http://baike.baidu.com'
    soup = BeautifulSoup(page_html, 'html.parser')

    title_box = soup.find('dd', class_="lemmaWgt-lemmaTitle-title")
    heading = title_box.find('h1') if title_box is not None else None
    # Look the summary block up once; it supplies both the text and the links.
    summary = soup.find('div', class_="lemma-summary")
    if heading is None or summary is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no title or summary found in page: ' + one_url)

    title = heading.get_text()
    # Strip the pronunciation prefix that appears on the Python entry.
    introduction = summary.get_text().replace('\nPython[1]\xa0\n（英国发音：/ˈpaɪθən/ 美国发音：/ˈpaɪθɑːn/）,', '')

    # Only links inside the summary are followed; hrefs are site-relative.
    links = {
        base_url + anchor['href']
        for anchor in summary.find_all('a', href=re.compile("^/item/"))
    }

    message.append(one_url + ' : ' + title + '_' + introduction)
    return links

def out_data():
    """Print every record collected in the module-level ``message`` list, one per line."""
    if message:
        print(*message, sep='\n')

if __name__ == '__main__':
    # Module-level state shared with url_manager(), analysis() and out_data().
    new_urls = set()   # URLs waiting to be crawled
    old_urls = set()   # URLs already attempted
    message = []       # one record per successfully parsed page

    start_url = 'http://baike.baidu.com/item/Python'

    # Seed the queue with the links found in the start page's summary.
    # The start URL is marked visited first so that links pointing back to
    # it are never queued.
    old_urls.add(start_url)
    page_html = download_html(start_url)
    if page_html:
        try:
            url_manager(analysis(page_html, start_url))
        except Exception:
            print('爬取失败')
    else:
        print('爬取失败')

    # Crawl at most 100 further pages.
    for _ in range(100):
        if not new_urls:
            # Nothing left to crawl; set.pop() on an empty set would raise.
            break
        url = new_urls.pop()
        # Record the attempt before crawling, so a failed or self-linking
        # page cannot be queued again.
        old_urls.add(url)
        try:
            page_html = download_html(url)
            if not page_html:
                continue
            url_manager(analysis(page_html, url))
        except Exception:
            print('爬取失败')

    # Print the collected records in crawl order.
    out_data()

秒客网

python_爬百度百科词条

相关文章