from selenium import webdriver

# Open Douban Music and print the title of every album in the
# "new albums" section of the front page.
driver = webdriver.Firefox()
try:
    driver.get('https://music.douban.com/')
    # One <a>/<div> element per album title.
    for album in driver.find_elements_by_css_selector('.new-albums .album-title'):
        print(album.text)
finally:
    # BUG FIX: the original never closed the browser, leaking a Firefox
    # process on every run (and the original misspelled "driver" as "dirver").
    driver.quit()
import requests
from lxml import html

# Log in to Bitbucket with a persistent requests session, then scrape
# repository names from the dashboard overview page.

# The session object keeps cookies across requests, so the login
# carries over into the later dashboard request.
session_requests = requests.session()

# Fetch the login page first to extract the CSRF token that the
# Django-based backend expects back in the login POST.
login_url = "https://bitbucket.org/account/signin/?next=/"
result = session_requests.get(login_url)
tree = html.fromstring(result.text)
authenticity_token = list(set(
    tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]

payload = {
    "username": "<你的用户名>",
    "password": "<你的密码>",
    # The login form has a hidden input named "csrfmiddlewaretoken".
    "csrfmiddlewaretoken": authenticity_token,
}

# Perform the login; the Referer header is required by the CSRF check.
result = session_requests.post(
    login_url,
    data=payload,
    headers=dict(referer=login_url),
)

# Now authenticated — fetch the dashboard overview page.
url = 'https://bitbucket.org/dashboard/overview'
result = session_requests.get(
    url,
    headers=dict(referer=url),
)

# Extract the repository names.
tree = html.fromstring(result.content)
# BUG FIX: no trailing slash on the path (we want the <span> itself,
# not its children).
bucket_elems = tree.findall(".//span[@class='repo-name']")
# BUG FIX: text_content is a METHOD (the original accessed it as an
# attribute and then called .replace on the bound method), and the
# original stripped the letter "n" instead of newlines.
bucket_names = [bucket.text_content().replace("\n", "").strip()
                for bucket in bucket_elems]
print(bucket_names)
from bs4 import BeautifulSoup
import requests


# NOTE(review): the original class/variable name was the censored
# placeholder "****", which is not a valid Python identifier; it has
# been renamed so the script can actually run. The censored host names
# inside the URL strings are left untouched (runtime strings).
class PassportLogin(object):
    """Log in to the ****.net passport (CAS) service and fetch the blog post list."""

    def __init__(self, headers):
        # One shared session so the login cookies persist across calls.
        self.session = requests.Session()
        self.headers = headers

    def get_webflow(self):
        """Fetch the login page and return the (lt, execution) CAS web-flow tokens.

        Both values come from hidden <input> fields on the login form and
        must be posted back with the credentials.
        """
        url = 'http://passport.****.net/account/login'
        response = self.session.get(url=url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        lt = soup.find('input', {'name': 'lt'})['value']
        execution = soup.find('input', {'name': 'execution'})['value']
        # Free the parse tree early; we only needed the two tokens.
        soup.clear()
        return (lt, execution)

    def login(self, account, password):
        """POST the credentials plus web-flow tokens; print a status message."""
        self.username = account
        self.password = password
        lt, execution = self.get_webflow()
        data = {
            'username': account,
            'password': password,
            'lt': lt,
            'execution': execution,
            '_eventId': 'submit',
        }
        url = 'http://passport.****.net/account/login'
        response = self.session.post(url=url, headers=self.headers, data=data)
        # BUG FIX (robustness): use a 2xx check rather than exactly 200 only
        # would change behavior — kept as the original exact-200 test.
        if response.status_code == 200:
            print('正常')
        else:
            print('异常')

    def func(self):
        """Fetch the authenticated post list page and dump its HTML."""
        headers1 = {
            'Host': 'write.blog.****.net',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
        }
        # allow_redirects=False so a redirect to the login page (session
        # not accepted) is visible instead of silently followed.
        response = self.session.get(url='http://write.blog.****.net/postlist',
                                    headers=headers1, allow_redirects=False)
        print(response.text)


if __name__ == '__main__':
    headers = {
        'Host': 'passport.****.net',
        'Origin': 'http://passport.****.net',
        'Referer': 'http://passport.****.net/account/login',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
    }
    client = PassportLogin(headers=headers)
    account = ''
    password = ''
    client.login(account=account, password=password)
    client.func()
# coding=utf-8
"""Crawl the Mtime Top-100 movie list and print ranking/rating details
for each movie via the site's rating AJAX service."""
import re
import time
import json
import sys

# Desktop Chrome UA so the site serves the normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
}

# Pre-compiled (raw-string) pattern for movie detail-page links.
# BUG FIX: the original used a non-raw string for the \d escape.
_MOVIE_URL_RE = re.compile(r'http://movie.mtime.com/(\d+)/')


def Get_Movie_URL():
    """Return the detail-page URLs of all movies on the 10 Top-100 list pages."""
    # Third-party imports kept function-local so the module itself
    # imports cleanly without requests/bs4 installed.
    import requests
    from bs4 import BeautifulSoup as BS
    urls = []
    for page in range(1, 11):
        # Page 1 uses a different URL scheme from pages 2-10.
        if page == 1:
            url = "http://www.mtime.com/top/movie/top100/"
        else:
            url = "http://www.mtime.com/top/movie/top100/index-%d.html" % page
        r = requests.get(url=url, headers=headers)
        soup = BS(r.text, 'lxml')
        # 'class': True matches only anchors that carry a class attribute
        # (equivalent to the original's obscure `not None`).
        movies = soup.find_all(name='a',
                               attrs={'target': '_blank',
                                      'href': _MOVIE_URL_RE,
                                      'class': True})
        for m in movies:
            urls.append(m.get('href'))
    return urls


def Create_Ajax_URL(url):
    """Build the rating-service AJAX URL for a movie detail-page *url*.

    The movie id is the last path component of the detail URL; the
    trailing "0368" mimics the millisecond part of the timestamp the
    service expects.
    """
    movie_id = url.split('/')[-2]
    t = time.strftime("%Y%m%d%H%M%S0368", time.localtime())
    ajax_url = ("http://service.library.mtime.com/Movie.api"
                "?Ajax_CallBack=true"
                "&Ajax_CallBackType=Mtime.Library.Services"
                "&Ajax_CallBackMethod=GetMovieOverviewRating"
                "&Ajax_CrossDomain=1"
                "&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s"
                % (url, t, movie_id))
    return ajax_url


def Crawl(ajax_url):
    """Fetch one AJAX rating payload and print the movie's rating summary."""
    import requests  # function-local: third-party
    r = requests.get(url=ajax_url, headers=headers)
    if r.status_code != 200:
        return
    r.encoding = 'utf-8'
    # The body looks like "var result = {...};" — extract the JSON part.
    # BUG FIX: the original indexed findall(...)[0] unconditionally and
    # then tested the (never-None) element; when the service returned an
    # unexpected body this raised IndexError — the intermittent failure
    # the author noted. Guard the empty-match case instead.
    matches = re.findall(r'=(.*?);', r.text)
    if not matches:
        return
    value = json.loads(matches[0])

    info = value.get('value')
    movieTitle = info.get('movieTitle')
    top_list = info.get('topList')
    TopListName = top_list.get('TopListName')
    Ranking = top_list.get('Ranking')
    movieRating = info.get('movieRating')
    RatingFinal = movieRating.get('RatingFinal')
    RDirectorFinal = movieRating.get('RDirectorFinal')
    ROtherFinal = movieRating.get('ROtherFinal')
    RPictureFinal = movieRating.get('RPictureFinal')
    RStoryFinal = movieRating.get('RStoryFinal')

    print(movieTitle)
    # Box-office data is only present for some movies.
    if info.get('boxOffice'):
        TotalBoxOffice = info.get('boxOffice').get('TotalBoxOffice')
        TotalBoxOfficeUnit = info.get('boxOffice').get('TotalBoxOfficeUnit')
        print('票房:%s%s' % (TotalBoxOffice, TotalBoxOfficeUnit))
    print('%s——No.%s' % (TopListName, Ranking))
    print('综合评分:%s 导演评分:%s 画面评分:%s 故事评分:%s 音乐评分:%s'
          % (RatingFinal, RDirectorFinal, RPictureFinal, RStoryFinal, ROtherFinal))
    print('****' * 20)


def main():
    """Crawl every movie on the Top-100 list."""
    urls = Get_Movie_URL()
    for u in urls:
        Crawl(Create_Ajax_URL(u))

    # Known-flaky single-movie example kept for reference:
    # Crawl(Create_Ajax_URL('http://movie.mtime.com/98604/'))


if __name__ == '__main__':
    main()
相关工具(百度网盘下载,提取码见各链接后):
链接: https://pan.baidu.com/s/1oEw_MsaAWcMx7NQII6jXYg 密码: e6b6
链接: https://pan.baidu.com/s/1fSppM-hK2x9Jk9RGqvRMqg 密码: 4q43