顺企网爬取 16 万条数据并保存到 MongoDB

import requests

from bs4 import BeautifulSoup

import pymongo

from multiprocessing.dummy import Pool as ThreadPool

# Request headers: present the crawler as Mobile Safari on an iPhone.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) '
        'AppleWebKit/601.1.46 (KHTML, like Gecko) '
        'Version/9.0 Mobile/13B143 Safari/601.1'
    ),
}

# Database setup: connect to the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient(host='localhost', port=27017)
conpany_info = client['conpany_info']  # database handle (name kept as spelled)
sheet_table = conpany_info['sheet_table']  # collection the scraped rows go into

def jiexi(url, timeout=10):
    """Fetch one company page and parse its label/value list into a dict.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict mapping the text of each ``.codl dt`` element (with leading
        and trailing full-width colons removed) to the text of the
        ``.codl dd`` element at the same position, or ``None`` when the
        server answers 404.

    Raises:
        requests.RequestException: On connection errors, timeouts, or any
            HTTP error status other than 404.
    """
    # Without a timeout a stalled connection would block the caller forever.
    res = requests.get(url, headers=headers, timeout=timeout)

    if res.status_code == 404:
        return None
    # Report other HTTP failures instead of silently parsing an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'lxml')
    labels = [dt.text for dt in soup.select('.codl dt')]
    values = [dd.text for dd in soup.select('.codl dd')]
    # zip() pairs labels and values by position and stops at the shorter list.
    return {label.strip('：'): value for label, value in zip(labels, values)}

# Lazily yields one page URL per numeric id from 2 up to and including 160997.
urls = (
    'https://m.11467.com/jinan/co/{}.htm'.format(page_id)
    for page_id in range(2, 160998)
)

def get_all_data(url):
    """Scrape one page and store its parsed fields in ``sheet_table``.

    Args:
        url: Page address handed to ``jiexi``.

    Returns:
        None. A non-empty parse result is inserted into ``sheet_table`` and
        the collection's document count is printed. Any exception raised
        while fetching, parsing or storing is caught and printed together
        with the URL rather than propagated to the caller.
    """
    try:
        result = jiexi(url)
        if result:
            # Collection.insert() and Cursor.count() were removed in
            # PyMongo 4; insert_one() and estimated_document_count() are
            # their supported replacements.
            sheet_table.insert_one(result)
            print('获取了 ' + str(sheet_table.estimated_document_count()) + '条数据')
    except Exception as e:
        # Worker boundary: report the failing URL and carry on.
        print(e, url)

if __name__ == "__main__":
    # Spread the page downloads over four worker threads; map() blocks
    # until get_all_data has been called for every URL.
    pool = ThreadPool(processes=4)
    results = pool.map(func=get_all_data, iterable=urls)

    # Nothing more will be submitted: shut the pool down and wait for the
    # workers to exit.
    pool.close()
    pool.join()
顺企网爬取 16 万条数据并保存到 MongoDB

顺企网爬取 16 万条数据并保存到 MongoDB