Python实现爬取需要登录的网站完整示例

时间:2024-04-16 14:40:19
# Scrape the "new albums" titles from Douban Music with Selenium.
# Fixes vs. original: "dirver" typo, scrape-escaped quotes, and
# find_elements_by_css_selector (removed in Selenium 4) replaced by the
# By.CSS_SELECTOR locator API; browser is now closed in a finally block.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()
try:
    driver.get('https://music.douban.com/')
    for album in driver.find_elements(By.CSS_SELECTOR, '.new-albums .album-title'):
        print(album.text)
finally:
    driver.quit()  # original leaked the browser process
读取页面整合后的结果

 

 1 import requests
 2 from lxml import html
 3 # 创建 session 对象。这个对象会保存所有的登录会话请求。
 4 session_requests = requests.session()
 5 # 提取在登录时所使用的 csrf 标记
 6 login_url = "https://bitbucket.org/account/signin/?next=/"
 7 result = session_requests.get(login_url)
 8 tree = html.fromstring(result.text)
 9 authenticity_token = list(set(tree.xpath("//input[@name=\'csrfmiddlewaretoken\']/@value")))[0]
10 payload = {
11   "username": "<你的用户名>",
12   "password": "<你的密码>",
13   "csrfmiddlewaretoken": authenticity_token # 在源代码中,有一个名为 “csrfmiddlewaretoken” 的隐藏输入标签。
14 }
15 # 执行登录
16 result = session_requests.post(
17   login_url,
18   data = payload,
19   headers = dict(referer=login_url)
20 )
21 # 已经登录成功了,然后从 bitbucket dashboard 页面上爬取内容。
22 url = \'https://bitbucket.org/dashboard/overview\'
23 result = session_requests.get(
24   url,
25   headers = dict(referer = url)
26 )
27 # 测试爬取的内容
28 tree = html.fromstring(result.content)
29 bucket_elems = tree.findall(".//span[@class=\'repo-name\']/")
30 bucket_names = [bucket.text_content.replace("n", "").strip() for bucket in bucket_elems]
31 print(bucket_names)
View Code

 

 1 from bs4 import BeautifulSoup
 2 import requests
 3 
 4 class ****(object):
 5     def __init__(self, headers):
 6         self.session = requests.Session()
 7         self.headers = headers
 8     def get_webflow(self):
 9         url = \'http://passport.****.net/account/login\'
10         response = self.session.get(url=url, headers=self.headers)
11         soup = BeautifulSoup(response.text, \'html.parser\')
12         lt = soup.find(\'input\', {\'name\': \'lt\'})[\'value\']
13         execution = soup.find(\'input\', {\'name\': \'execution\'})[\'value\']
14         soup.clear()
15         return (lt, execution)
16     def login(self, account, password):
17         self.username = account
18         self.password = password
19         lt, execution = self.get_webflow()
20         data = {
21             \'username\': account,
22             \'password\': password,
23             \'lt\': lt,
24             \'execution\': execution,
25             \'_eventId\': \'submit\'
26         }
27         url = \'http://passport.****.net/account/login\'
28         response = self.session.post(url=url, headers=self.headers, data=data)
29         if (response.status_code == 200):
30             print(\'正常\')
31         else:
32             print(\'异常\')
33     def func(self):
34         headers1={
35             \'Host\':\'write.blog.****.net\',
36             \'Upgrade-Insecure-Requests\':\'1\',
37             \'User-Agent\':\'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36\'
38         }
39         response=self.session.get(url=\'http://write.blog.****.net/postlist\',headers=headers1,allow_redirects=False)
40         print(response.text)
41 if __name__ == \'__main__\':
42     headers = {
43         \'Host\': \'passport.****.net\',
44         \'Origin\': \'http://passport.****.net\',
45         \'Referer\':\'http://passport.****.net/account/login\',
46         \'Upgrade-Insecure-Requests\':\'1\',
47         \'User-Agent\': \'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36\',
48     }
49     **** = ****(headers=headers)
50     account = \'\'
51     password = \'\'
52     ****.login(account=account, password=password)
53     ****.func()
View Code
 1 #coding=utf-8  
 2 import requests  
 3 import re  
 4 import time  
 5 import json  
 6 from bs4 import BeautifulSoup as BS  
 7 import sys 
 8   
 9 headers = {  
10     \'User-Agent\':\'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36\',  
11 }  
12   
def Get_Movie_URL():
    """Collect the detail-page URLs of mtime's Top-100 movies (10 list pages).

    Returns a list of movie detail URLs like http://movie.mtime.com/<id>/.
    """
    urls = []
    for i in range(1, 11):
        # Page 1 uses a different URL pattern than pages 2-10.
        if i != 1:
            url = "http://www.mtime.com/top/movie/top100/index-%d.html" % i
        else:
            url = "http://www.mtime.com/top/movie/top100/"
        r = requests.get(url=url, headers=headers)
        soup = BS(r.text, 'lxml')
        # fix: the pattern is now a raw string — "\d" in a plain string is an
        # invalid escape sequence (DeprecationWarning, SyntaxError in 3.12+).
        # 'class': not None means "anchor must carry a class attribute".
        movies = soup.find_all(name='a',
                               attrs={'target': '_blank',
                                      'href': re.compile(r'http://movie.mtime.com/(\d+)/'),
                                      'class': not None})
        for m in movies:
            urls.append(m.get('href'))
    return urls
27   
def Create_Ajax_URL(url):
    """Build the JSONP service URL that returns a movie's rating payload.

    url: a movie detail URL like http://movie.mtime.com/98604/ — the movie id
    is its second-to-last path segment.
    """
    movie_id = url.split('/')[-2]
    # Timestamp cache-buster; the literal "0368" suffix mimics the
    # millisecond/random tail the site's own JS appends.
    t = time.strftime("%Y%m%d%H%M%S0368", time.localtime())
    ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s" % (url, t, movie_id)
    return ajax_url
33   
def Crawl(ajax_url):
    """Fetch one movie's JSONP rating payload and print a summary.

    The endpoint answers with "var result_... = {json};", so the JSON body is
    extracted with a regex before parsing.
    """
    r = requests.get(url=ajax_url, headers=headers)
    if r.status_code != 200:
        return
    r.encoding = 'utf-8'
    # fix: the original did re.findall(...)[0] and then tested the element
    # for None — findall elements are strings, never None, and the [0]
    # raises IndexError whenever the payload is missing. This empty-match
    # case is the intermittent failure the original author noted in main().
    matches = re.findall(r'=(.*?);', r.text)
    if not matches:
        return
    value = json.loads(matches[0])

    movieTitle = value.get('value').get('movieTitle')
    TopListName = value.get('value').get('topList').get('TopListName')
    Ranking = value.get('value').get('topList').get('Ranking')
    movieRating = value.get('value').get('movieRating')
    RatingFinal = movieRating.get('RatingFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')
    print(movieTitle)
    # Box-office data only exists for some movies.
    if value.get('value').get('boxOffice'):
        TotalBoxOffice = value.get('value').get('boxOffice').get('TotalBoxOffice')
        TotalBoxOfficeUnit = value.get('value').get('boxOffice').get('TotalBoxOfficeUnit')
        print('票房:%s%s' % (TotalBoxOffice, TotalBoxOfficeUnit))
    print('%s——No.%s' % (TopListName, Ranking))
    print('综合评分:%s 导演评分:%s 画面评分:%s 故事评分:%s 音乐评分:%s' % (RatingFinal, RDirectorFinal, RPictureFinal, RStoryFinal, ROtherFinal))
    print('****' * 20)
59   
def main():
    """Crawl the rating summary for every Top-100 movie."""
    urls = Get_Movie_URL()
    for u in urls:
        Crawl(Create_Ajax_URL(u))

    # NOTE(review): the original author reported that a single movie link
    # would intermittently yield no data:
    # Crawl(Create_Ajax_URL('http://movie.mtime.com/98604/'))
    # (Handled now by the empty-match guard in Crawl.)


if __name__ == '__main__':
    main()
View Code

 

 相关工具

链接: https://pan.baidu.com/s/1oEw_MsaAWcMx7NQII6jXYg 密码: e6b6

 

链接: https://pan.baidu.com/s/1fSppM-hK2x9Jk9RGqvRMqg 密码: 4q43