利用 Python 爬取某壳的房产数据

时间:2024-05-05 20:00:16
import datetime
import json
import time

import pandas as pd
import requests
from pyquery import PyQuery as pq

# Column order used for every row appended to the CSV output.
columns = ['id', 'title', 'place', 'msg', 'price', 'per_meter', 'area', 'city']
# District display names; get_a_page() receives an index into this list.
areas = ['滨湖区', '梁溪区', '新吴区', '惠山区', '锡山区', '江阴市', '宜兴市']
# Seconds to wait on the server before abandoning a request, so that a
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def get_a_page(url, area):
    """Fetch one listing page and append its rows to today's CSV file.

    Each element selected under ``.sellListContent`` is turned into one row
    (id, title, place, msg, price, per_meter, area, city). Every row is
    printed as ``<n>:<json>`` and the page's rows are appended, without
    header or index, to ``wx<YYYY-MM-DD>.csv`` in the working directory.

    Args:
        url: Address of the listing page to download.
        area: Index into the module-level ``areas`` list; the name found
            there is stored in the ``area`` column of every row.

    Returns:
        None. Pages answered with an HTTP error status are reported on
        stdout and skipped without touching the CSV file.

    Raises:
        requests.RequestException: If the request fails or times out.
        IndexError: If ``area`` is out of range for ``areas`` and the page
            contains at least one listing.
    """
    result = requests.get(url, timeout=REQUEST_TIMEOUT)
    if not result.ok:
        # An error page carries no listings; say so instead of silently
        # parsing it as an empty result.
        print(f'skip {url}: HTTP {result.status_code}')
        return

    doc = pq(result.text)
    listings = doc('.sellListContent').children('.clear .info.clear').items()

    city = '无锡'
    rows = []
    for count, div in enumerate(listings, start=1):
        row = {
            # The listing id is read from the unit-price element's
            # data-hid attribute.
            'id': div.children('.address .priceInfo .unitPrice').attr('data-hid'),
            'title': div.children('.title a').text(),
            'place': div.children('.address .flood .positionInfo a').text(),
            'msg': div.children('.address .houseInfo').text(),
            'price': div.children('.address .priceInfo .totalPrice span').text(),
            'per_meter': div.children('.address .priceInfo .unitPrice span').text(),
            'area': areas[area],
            'city': city,
        }
        rows.append(row)
        print(f'{count}:{json.dumps(row, ensure_ascii=False)}')

    df = pd.DataFrame(rows, columns=columns)
    # Append mode without a header: every page of every district lands in
    # the same per-day file.
    df.to_csv('wx' + time.strftime('%Y-%m-%d') + '.csv',
              mode='a', index=False, header=False)


if __name__ == '__main__':
    # URL slugs of the districts, in the same order as `areas`.
    quyu = ['binhu', 'liangxi', 'xinwu', 'huishan', 'xishan',
            'jiangyinshi', 'yixingshi']
    for index, qy in enumerate(quyu):
        # Pages 1 through 19 of each district.
        for i in range(1, 20):
            get_a_page(f'https://wx.ke.com/ershoufang/{qy}/pg{i}tt9/', index)