Learning Python Web Scraping: Crawling 58同城 with Multiple Processes

Date: 2022-03-30 03:39:01

Approach: there are multiple channels (categories), each channel page lists many item links, and each item has its own detail page. First crawl the item links from each channel page and store them in the database; then read each detail-page link back out of the database and scrape the information on that detail page.

 

First, channel_extact.py, which scrapes the links of the different channels:

from bs4 import BeautifulSoup
import requests

start_url = 'http://bj.58.com/sale.shtml'
url_host = 'http://bj.58.com'

def get_index_url(url):     # extract the navigation-bar links; each channel has its own listing pages
    # url = start_url
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')        # grab all navigation-bar links
    # print(links)
    for link in links:
        page_url = url_host + link.get('href')        # build the full link
        print(page_url)

get_index_url(start_url)        # prints the channel links; note this also runs whenever the module is imported

# long string of channel URLs (the phone-number channel was removed)
channel_list = '''
    http://bj.58.com/shouji/
    http://bj.58.com/tongxunyw/
    http://bj.58.com/diannao/
    http://bj.58.com/bijiben/
    http://bj.58.com/pbdn/
    http://bj.58.com/diannaopeijian/
    http://bj.58.com/zhoubianshebei/
    http://bj.58.com/shuma/
    http://bj.58.com/shumaxiangji/
    http://bj.58.com/mpsanmpsi/
    http://bj.58.com/youxiji/
    http://bj.58.com/jiadian/
    http://bj.58.com/dianshiji/
    http://bj.58.com/ershoukongtiao/
    http://bj.58.com/xiyiji/
    http://bj.58.com/bingxiang/
    http://bj.58.com/binggui/
    http://bj.58.com/chuang/
    http://bj.58.com/ershoujiaju/
    http://bj.58.com/yingyou/
    http://bj.58.com/yingeryongpin/
    http://bj.58.com/muyingweiyang/
    http://bj.58.com/muyingtongchuang/
    http://bj.58.com/yunfuyongpin/
    http://bj.58.com/fushi/
    http://bj.58.com/nanzhuang/
    http://bj.58.com/fsxiemao/
    http://bj.58.com/xiangbao/
    http://bj.58.com/meirong/
    http://bj.58.com/yishu/
    http://bj.58.com/shufahuihua/
    http://bj.58.com/zhubaoshipin/
    http://bj.58.com/yuqi/
    http://bj.58.com/tushu/
    http://bj.58.com/tushubook/
    http://bj.58.com/wenti/
    http://bj.58.com/yundongfushi/
    http://bj.58.com/jianshenqixie/
    http://bj.58.com/huju/
    http://bj.58.com/qiulei/
    http://bj.58.com/yueqi/
    http://bj.58.com/bangongshebei/
    http://bj.58.com/diannaohaocai/
    http://bj.58.com/bangongjiaju/
    http://bj.58.com/ershoushebei/
    http://bj.58.com/danche/
    http://bj.58.com/fzixingche/
    http://bj.58.com/diandongche/
    http://bj.58.com/sanlunche/
    http://bj.58.com/peijianzhuangbei/
    http://bj.58.com/tiaozao/
'''
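main.py below simply calls channel_list.split() on this triple-quoted string to get a plain list of channel URLs. A minimal check, just for illustration, of what that produces:

# split() with no arguments splits on any whitespace and drops the empty
# leading/trailing pieces, leaving one URL per list element
channels = channel_list.split()
print(len(channels))        # number of channels in the string
print(channels[0])          # 'http://bj.58.com/shouji/'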


Next is pages_parsing.py, which contains two spiders: the first stores all the item links under a channel page in the database, and the second scrapes the information from each detail page and stores it in the database.

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# Store the links scraped from each channel in the database, then take each detail-page
# link back out of the database, crawl it with spider 2, and store the result as well
client = pymongo.MongoClient('localhost', 27017)
ceshi = client['ceshi']     # the database
url_list = ceshi['url_list4']       # collection for the item links
item_info = ceshi['item_info4']     # collection for the information from each detail page


# The name on the left is the Python variable; the name in brackets is the name used in the database
# Spider 1: grab all item links under a category, but only for the page number given
def get_links_from(channel, pages, who_sells=1):
    # if td.t is missing (each item is one table row), stop: the pages have run out
    # e.g. http://bj.58.com/diannao/pn2/
    list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(1)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    if soup.find('td', 't'):        # used to decide whether this page still has listings
        for link in soup.select('td.t a.t'):        # loop over the item links on the page and store them
            item_link = link.get('href').split('?')[0]
            url_list.insert_one({'url': item_link})     # insert the link into the database
            print(item_link)
# get_links_from('http://bj.58.com/shuma/',2)


# Spider 2: scrape the detail-page information; also needs to recognise 404 pages
def get_item_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # a removed listing redirects to a 404 page whose script src contains '404'
    script = soup.find('script', type="text/javascript")
    no_longer_exist = bool(script and script.get('src') and '404' in script.get('src').split('/'))
    if no_longer_exist:     # skip listings that no longer exist
        pass
    else:
        title = soup.title.text
        price = soup.select('span.price.c_f50')
        date = soup.select('.time')
        area = soup.select('.c_25d a') if soup.find_all('span', 'c_25d') else []       # some listings have no address
        areas = [are.get_text() for are in area]        # flat list of the address parts
        areas2 = []      # join the multi-level address parts pairwise; could be improved
        for i in range(0, len(areas) - 1):
            areas2.append(areas[i] + '-' + areas[i + 1])
        data = {
            'title': title,
            'price': price[0].get_text() if price else None,
            'date': date[0].get_text() if date else None,
            'area': areas2,
            'url': url,
        }
        item_info.insert_one(data)
        print(data)
# urls = [get_links_from('http://bj.58.com/shouji/',3)]
# for url in urls:
#     get_item_info(url)
url1 = 'http://bj.58.com/shuma/29075926847818x.shtml'
get_item_info(url1)     # quick test; note this also runs whenever the module is imported


Then the main function, main.py, which runs the crawl with multiple processes:

from multiprocessing import Pool        # multiprocessing lets the task use several CPU cores
from channel_extact import channel_list
from pages_parsing import get_links_from

def get_all_links_from(channel):        # crawl several pages of a channel and store each page's links in the database
    for i in range(1, 100):
        get_links_from(channel, i)


if __name__ == '__main__':      # standard guard needed for multiprocessing
    pool = Pool()   # create a process pool; work put into the pool is spread over the CPUs;
                    # the number of processes can be set with an argument, otherwise it is chosen automatically
    pool.map(get_all_links_from, channel_list.split())       # map feeds the channels from channel_list one by one into get_all_links_from
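main.py only runs spider 1. The second step described at the top (taking the detail-page links back out of the database and scraping each one with get_item_info) still has to be wired up. A minimal sketch of that second pass, reusing the collections and the spider from pages_parsing.py (the resume logic that skips already-scraped URLs is an added assumption, not part of the original code):

from multiprocessing import Pool
from pages_parsing import url_list, item_info, get_item_info

def get_all_item_info():
    # read every stored listing URL back out of MongoDB
    db_urls = [item['url'] for item in url_list.find()]
    # skip URLs whose detail page has already been scraped (simple resume logic)
    done_urls = [item['url'] for item in item_info.find()]
    rest_urls = set(db_urls) - set(done_urls)
    pool = Pool()
    pool.map(get_item_info, rest_urls)

if __name__ == '__main__':
    get_all_item_info()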




 

Finally, a small script that keeps count of how many item links have been scraped into the database:

import time
from pages_parsing import url_list

while True:
    print(url_list.count_documents({}))     # total number of stored links; cursor.count() is deprecated in newer pymongo
    time.sleep(5)