一个python小爬虫

自定义获取豆瓣网电影TOP250里的排名数量

主要思路：先由requests库获取html基本信息，然后用BeautifulSoup来进行html.parser格式解析，逐个获取Tag属性，并且对内容进行字符串切片，字符串匹配，保存得到电影名称，地点，日期。

from bs4 import BeautifulSoup

import requests

import re

#由于中文编码的原因需要重新计算中文所占长度

def SuperLen(s):

    sum_len=len(s)

    chi_len=len(re.sub('[a-zA-Z]','',s))

    eng_len=sum_len-chi_len

    return (chi_len*2+eng_len)

#此函数功能是准确得到中英文混合字符串所占长度

#由于每页只有25个电影，所以用户需求电影数目不同页数（URL）也就不同

def Get_Page(n):

    if n%25!=0:

        num=int(n/25)+1

    else:

        num=int(n/25)

    return num

#此函数功能是获取页数

#根据所需参数页数来对应不同的URL

def Num_Get_Soup(n):

    url='https://movie.douban.com/top250?start='+str(n)+'&filter='

    resul=requests.get(url)

    soup=BeautifulSoup(resul.text,'html.parser')

    return soup

#此函数功能就是对URL进行requests.get 以获取对应的BeautifulSoup

#使用BeautifulSoup

def Get_Name_Date_Locat(nums):

    NAME=[]

    Date=[]

    Locat=[]

    for j in range(Get_Page(nums)):  #获取页数，进行循环操作

        soup=Num_Get_Soup(j*25)      #根据不同页数的不同URL获取soup

        Name_temp=soup.find_all('img','',limit=25)    #获取名称Tag，数目最大且为25

        Date_Locat_temp=soup.find_all('p','',limit=25)#获取日期 地点Tag

        for i in Name_temp:

            NAME.append(i.attrs['alt'])  #对获取的Tag获取属性

        for l in Date_Locat_temp:

            stemp=str(l)

            Date.append(re.sub('\D','',stemp.split('\xa0')[-5])[:4]) #对字符串切片后进行字符串匹配获取数字日期

            Locat.append(stemp.split('\xa0')[-3])  #字符串切片获取地点

    return (NAME,Date,Locat)

#此函数功能是格式化写入文件

def Write_text(n,N,D,L):

    f=open('result.txt','w')

    for i  in range(n):

        f.write(N[i]+'{}\t'.format((35-SuperLen(N[i]))*' '))

        f.write(L[i]+'{}\t'.format((70-len(L[i]*2))*' '))

        f.write(D[i])

        f.write('\n')

    f.close()

#函数的开始

def Start():

    nums=eval(input('请输入要爬取排名的个数0-250均可'))

    print('Please Waitng........')

    Name,Date,Locat=Get_Name_Date_Locat(nums)

    Write_text(nums,Name,Date,Locat)

    print('Complete!')

Start()

效果图：

一个python小爬虫

可改进思路：正则表达和字符串切片的使用可以再合理一些，格式化保存文件也可以再简化。

秒客网

一个python小爬虫

相关文章