Python爬取ithome的所有新闻标题、评论数及其他一些信息并存入Excel中。

时间:2022-09-18 18:03:56
 1 # coding=utf-8
 2 import numpy as np
 3 import pandas as pd
 4 import sys
 5 
 6 from selenium import webdriver
 7 import time
 8 import requests
 9 import re
10 from openpyxl.workbook import Workbook
11 import matplotlib.pyplot as plt
12 import matplotlib
13 
# Front-page harvest: article links and their headlines, filled in lockstep.
urls, titles = [], []

# Result columns for the exported table. The crawl loop adds one entry to
# each of these per article whose metadata could be parsed.
urls_new, titles_new = [], []
days, authors, sources, comments, ty = [], [], [], [], []

# Not referenced anywhere in the visible script; kept so the module-level
# name still exists.
comment = []
def save_to_file(file_name, contents):
    """Write ``contents`` to ``file_name`` as UTF-8 text.

    An existing file at that path is overwritten.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # codec, which may be unable to encode every character of a scraped page.
    # The ``with`` block closes the handle even when the write raises.
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)
28 
# --- Download the ithome front page -----------------------------------------
url = "https://www.ithome.com/"
# The request is sent with a desktop-browser User-Agent.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
# Without a timeout, requests would wait indefinitely on a stalled connection.
rep = requests.get(url, headers=headers, timeout=30)
rep.encoding = "utf-8"
strw = rep.text
# Save the raw page to disk (presumably to debug the patterns below).
save_to_file('ithome.html', strw)

# --- Step 1: isolate the "new-list" container --------------------------------
p = re.compile(r'<div class="lst lst-1 new-list">(.*?)</div>\s*?</div>')
m = p.findall(strw)
if not m:
    # Fail with a clear message instead of an IndexError on m[0] when the
    # page layout no longer matches the pattern.
    raise RuntimeError('news list container not found in ' + url)
print(len(m[0]))

# --- Step 2: split the container into the bodies of its <ul> blocks ----------
p = re.compile(r'<div class=\"block \d{4} new-list-\d{1}\"(?: style=\".*?\")?><ul>(.*?)</ul></div>')
m2 = p.findall(m[0])
print(len(m2))
43 
# Start the Chrome session used to render each article page.
# The path is a raw string so its backslashes are never read as escape
# sequences; the runtime value is unchanged. The driver location is
# machine-specific.
# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version - confirm against the version in use.
broswer = webdriver.Chrome(r'D:\谷歌\Google\Chrome\Application\chromedriver.exe')
45 
# One front-page entry: capture the article link and its headline text.
# An optional tag before the headline and an optional closing </font> are
# matched but not captured.
entry_pattern = re.compile(
    r'</span><span class=\"title\">.*?href=\"(.*?)\">(?:<.*?>)?(.*?)(?:</font>)?</a></span></li>')

# m2 holds the <ul> bodies of the front-page news blocks. It is only read
# here; the loop no longer rebinds it while iterating over it.
for block_html in m2:
    for link, headline in entry_pattern.findall(block_html):
        urls.append(link)
        titles.append(headline)
print(len(urls))
# Both patterns are loop-invariant, so they are compiled once up front.
# Category: the first path segment of the URL, after an optional "html/".
p2 = re.compile(r'https://\w+?.ithome.com/(?:html/)?(.*?)/.*?')
# Article metadata: publication date, source, author and comment count.
# Raw strings keep "\d" from being treated as a string escape.
p = re.compile(r'<span id="pubtime_baidu">(\d*-\d*-\d*).*?</span><span id="source_baidu">'
               r'来源:<a href=".*?" .*?>(.*?)</a></span><span id="author_baidu">'
               r'作者:(?:<strong>)?(.*?)(?:</strong>)?</span>.*?<span id="commentcount">(.*?)</span>')

try:
    # urls and titles are filled in lockstep, so zip pairs each link with
    # its headline.
    for article_url, headline in zip(urls, titles):
        print(u'读取中' + article_url)
        broswer.get(article_url)
        # Fixed pause after navigation (presumably to let the page finish
        # loading before its source is read).
        time.sleep(1)
        strw2 = broswer.page_source

        m2 = p2.findall(article_url)
        print(m2)
        m = p.findall(strw2)
        print(m)
        if not m:
            # Page without the expected metadata: contributes no row.
            continue

        day, source, author, comment_count = m[0]
        days.append(day)
        sources.append(source)
        authors.append(author)
        urls_new.append(article_url)
        comments.append(comment_count)
        titles_new.append(headline)
        # A URL that does not fit the category pattern gets an empty
        # category instead of raising IndexError, so all result columns
        # stay the same length.
        ty.append(m2[0] if m2 else '')
finally:
    # Close the browser even if a page load fails part-way through.
    broswer.quit()
print("读取结束")
# Assemble the collected columns into a table; `columns` fixes the order in
# which they appear in the sheet.
data = {
    '日期': days,
    '作者': authors,
    '来源': sources,
    '标题': titles_new,
    '链接': urls_new,
    '评论数量': comments,
    '新闻类型': ty,
}
df = pd.DataFrame(data, columns=['日期', '作者', '来源', '标题', '链接', '评论数量', '新闻类型'])
# The `encoding` keyword is no longer passed: pandas deprecated it as unused
# in 1.5 and removed it in 2.0, where it raises TypeError.
df.to_excel(r'ShuJuPa.xlsx', sheet_name='数据爬取结果')