Scraping Meituan Merchant Information with Python

Date: 2024-02-24 08:51:59

The script below scrapes basic merchant information (name, address, contact number) from Meituan. It starts from the Kunming city homepage (km.meituan.com), collects the top-level category links from the navigation menu, pages through each category's listing pages to extract the shop ids, and then requests each shop's detail page and parses the summary block.

import requests
from bs4 import BeautifulSoup
import json

# Region (city) base URL used as the crawl entry point
url = 'http://km.meituan.com/'

# Shop detail page URL template
url_shop = 'http://km.meituan.com/shop/{}'

# Request headers to mimic a normal browser visit
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    # Host should match the city domain being requested
    'Host': 'km.meituan.com',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://bj.meituan.com/shop/286725?acm=UwunyailsW15518532529028663069.286725.1&mtt=1.index%2Fdefault%2Fpoi.pz.1.j4cijrmg&cks=58899',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


# Collect all second-level menu (category) links from the city homepage
def get_start_menu_links():
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    links = [item.find('div').find('div').find('dl').find('dt').find('a')['href']
             for item in soup.find_all('div', class_='J-nav-item')]
    return links


# Extract the shop ids embedded in a listing page's async-load parameters
def get_shop_ids(url, headers=None):
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    content_id = json.loads(soup.find('div', class_='J-scrollloader cf J-hub')['data-async-params'])
    return json.loads(content_id.get('data')).get('poiidList')


def main():
    start_menu_links = get_start_menu_links()
    for link in start_menu_links:
        # Only page 4 is fetched here; widen the range to crawl more listing pages
        for pageNum in range(4, 5):
            category_url = link + '/all/page{}'.format(pageNum)
            for shop_id in get_shop_ids(category_url, headers=headers):
                html = requests.get(url_shop.format(shop_id), headers=headers).text
                soup = BeautifulSoup(html, 'lxml')
                shop_detail = soup.find('div', class_='summary biz-box fs-section cf')
                print("================================== pageNum %s  shop_id: %s ==================================================" % (pageNum, shop_id))
                # Skip shops whose summary block is missing or cannot be parsed
                try:
                    shop_detail.find('div', class_='fs-section__left').find('h2').find('span').text
                except AttributeError:
                    continue
                print("Name:     " + shop_detail.find('div', class_='fs-section__left').find('h2').find('span').text)
                print("Address:  " + shop_detail.find('div', class_='fs-section__left').find('p', class_='under-title').find('span').text)
                print("Contact:  " + shop_detail.find('div', class_='fs-section__left').find('p', class_='under-title').find_next_sibling().text)


if __name__ == '__main__':
    main()
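The original script only prints each shop to the console. If you want to keep the results, one small extension is to collect the scraped fields and write them to a CSV file. Below is a minimal sketch of that idea using Python's standard csv module; the save_shops helper and the shops.csv filename are illustrative assumptions, not part of the original post.

import csv

# Hypothetical helper (not in the original script): write scraped rows to CSV.
# Each row is expected to be a (shop_id, name, address, contact) tuple that
# main() appends to a list instead of printing the fields directly.
def save_shops(rows, path='shops.csv'):
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['shop_id', 'name', 'address', 'contact'])  # header row
        writer.writerows(rows)

In main() you would replace the three print calls with something like rows.append((shop_id, name, address, contact)) and call save_shops(rows) once the loops finish.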


This article is reposted from https://www.cnblogs.com/zyndev/p/7612989.html