# -*- coding: utf-8 -*-
# @Time    : 2017/9/5 10:58
# @Author  : z
# @File    : 房天下.py
# @Software: PyCharm
import re

import requests
from bs4 import BeautifulSoup


class House(object):

    def __init__(self):
        pass

    def get_url(self, url):
        """Fetch a page and keep its decoded HTML on the instance."""
        response = requests.get(url)
        response.encoding = response.apparent_encoding
        self.html = response.text

    # Build the start URL(s) and kick off the crawl.
    def join_url(self):
        # Multi-city draft: pull every city from citys.js and crawl each one.
        # (A cleaned-up sketch of this loop follows the script below.)
        # url = 'http://js.soufunimg.com/homepage/new/family/css/citys.js?v=20170520'
        # response = eval(requests.get(url).text[14:-1])
        # for ur in response:
        #     self.gain_num(ur['name'], 'http://newhouse.' + ur['url'].split('.')[0].split('//')[1] + '.fang.com/house/s/')
        self.gain_num('上海', 'http://newhouse.sh.fang.com/house/s/')

    # Read the maximum page number, then loop over every listing page.
    def gain_num(self, city, city_url):
        self.get_url(city_url)
        max_num = re.findall(
            r'\d+',
            re.findall(re.compile('<span class="ff3333">.*?</span>/(.*?)</span>', re.S), self.html)[0]
        )[0]
        for num in range(1, int(max_num) + 1):
            url = city_url + 'b9' + str(num)
            self.parse_page(city, url, num)
        # An unfinished draft parallelised this loop with the third-party
        # threadpool module; a concurrent.futures sketch of the same idea
        # is given after the script below.

    # Parse one listing page: project name, district, price and detail URL.
    def parse_page(self, city, city_url, num):
        self.get_url(city_url)
        soup = BeautifulSoup(self.html, 'lxml')
        for con in soup.find('div', id='newhouse_loupai_list').find('ul').find_all('li'):
            house = con.find('div', 'nlc_details').find('div', 'nlcd_name')
            price = con.find('div', 'nlc_details').find('div', 'nhouse_price')
            district_name = con.find('div', 'nlc_details').find('div', 'address').get_text().strip().split(']')[0].replace('\t', '').replace('[', '')
            house_url = house.a['href']
            project_name = house.a.get_text().strip()
            # Some projects have no price block; fall back to an empty string
            # so parse_detail() never sees an undefined value.
            project_price = price.find('span').get_text() if price else ''
            self.parse_detail(house_url, city, district_name, project_name, project_price, num)

    # Follow the second navigation link to the detail page and print every section.
    def parse_detail(self, house_url, city_name, district_name, project_name, project_price, num):
        self.get_url(house_url)
        detail = BeautifulSoup(self.html, 'lxml').find('div', id='orginalNaviBox').find_all('a')[1]['href']
        self.get_url(detail)
        soup = BeautifulSoup(self.html, 'lxml')
        # Rating and review count (currently unused)
        # grade = soup.find('div', 'main-info-comment').find_all('span')[2].get_text()
        # num = soup.find('div', 'main-info-comment').find_all('span')[3].get_text()[1:-1]
        content = soup.find_all('div', 'main-item')

        # Basic information
        print('------------------------------------1-----------------------------------------------')
        print('***********************')
        print('第{}页'.format(num))
        print(city_name)
        print(project_name)
        print(district_name)
        print(project_price)
        print('***********************')
        clearfix = content[0].find('ul', 'list clearfix').find_all('li')
        project_type = clearfix[0].get_text().strip().replace('\n', '').replace('\t', '')
        project_point = ','.join([te.get_text() for te in clearfix[1].find('div', 'list-right').find_all('span', 'tag')])
        project_buildform = clearfix[2].get_text().replace('\n', '').replace('\t', '')
        project_fixture = clearfix[3].get_text().strip().replace('\n', '').replace('\t', '')
        project_ownlife = clearfix[4].get_text().strip().replace('\n', '').replace('\t', '')
        line_name = clearfix[5].get_text().strip().replace('\n', '').replace('\t', '')
        developer_name = clearfix[6].get_text().strip().replace('\n', '').replace('\t', '')
        project_address = clearfix[7].get_text().strip().replace('\n', '').replace('\t', '')
        print(project_type)
        print('项目特色:' + project_point)
        print(project_buildform)
        print(project_fixture)
        print(project_ownlife)
        print(line_name)
        print(developer_name)
        print(project_address)
        print('')

        # Sales information
        list1 = content[1].find('ul', 'list clearfix').find_all('li')
        table = content[1].find('div', 'main-table')
        project_salestatu = list1[0].get_text().replace('\n', '').replace('\t', '').strip()
        project_discount = list1[1].get_text().replace('\n', '').replace('\t', '').strip()
        project_opendate = list1[2].get_text().replace('\n', '').replace('\t', '').strip()
        project_getdate = list1[3].get_text().replace('\n', '').replace('\t', '').strip()
        project_office = list1[4].get_text().replace('\n', '').replace('\t', '').strip()
        project_roomclass = list1[6].get_text().replace('\n', '').replace('\t', '').strip()
        print(project_salestatu)
        print(project_discount)
        print(project_opendate)
        print(project_getdate)
        print(project_roomclass)
        # Pre-sale permits, if the page lists any
        if table:
            for i in table.find_all('table'):
                for td in i.find_all('tr')[1:]:
                    presale_name1 = td.find_all('td')[0].get_text()
                    presale_date1 = td.find_all('td')[1].get_text()
                    presale_build = td.find_all('td')[2].get_text()
                    print('预售许可证: ' + presale_name1)
                    print('发证时间: ' + presale_date1)
                    print('绑定楼栋: ' + presale_build)
        print(' ')

        # Surrounding facilities (some detail pages lack this section)
        try:
            project_traffic = content[2].find('div', 'set').get_text().replace('\n', '').replace('\r', '').replace(' ', '').strip()
            project_support = content[2].find('div', 'set bd-1').find('p').get_text().replace('\n', '').replace('\r', '').replace('\t', '').strip()
            print('交通配套:' + project_traffic)
            print('项目配套:' + project_support)
        except Exception:
            pass
        print(' ')

        # Residential-area planning
        project = content[3].find('ul', 'clearfix list').find_all('li')
        project_allarea = project[0].get_text().replace('\n', '').replace('\t', '').strip()
        project_buildarea = project[1].get_text().replace('\n', '').replace('\t', '').strip()
        project_volume = project[2].get_text().replace('\n', '').replace('\t', '').strip()
        project_green = project[3].get_text().replace('\n', '').replace('\t', '').strip()
        project_carpark = project[4].get_text().replace('\n', '').replace('\t', '').strip()
        build_number = project[5].get_text().replace('\n', '').replace('\t', '').strip()
        project_household = project[6].get_text().replace('\n', '').replace('\t', '').strip()
        manager_name = project[7].get_text().replace('\n', '').replace('\t', '').strip()
        manager_price = project[8].get_text().replace('\n', '').replace('\t', '').strip()
        build_floor = project[9].get_text().replace('\n', '').replace('\t', '').strip()
        print(project_allarea)
        print(project_buildarea)
        print(project_volume)
        print(project_green)
        print(build_number)
        print(project_household)
        print(manager_name)
        print(manager_price)
        print(build_floor)
        print(' ')

        # Price history
        price = content[4].find_all('table')
        for tr in price:
            for td in tr.find_all('tr')[1:]:
                price_record = td.find_all('td')[0].get_text()
                price_value = td.find_all('td')[1].get_text()
                price_low = td.find_all('td')[2].get_text()
                price_caption = td.find_all('td')[3].get_text()
                print('记录时间:' + price_record)
                print('均价:' + price_value)
                print('起价:' + price_low)
                print('价格描述:' + price_caption)
        print(' ')

        # Project introduction
        project_caption = content[5].find('p', 'intro').get_text().strip()
        print('项目简介:' + project_caption)
        print(' ')
        print('---------------------------------------2-------------------------------------------')
        print(' ')


if __name__ == '__main__':
    House().join_url()
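The commented-out block in join_url() hints at a multi-city crawl driven by citys.js. Below is a minimal, untested sketch of that loop. It keeps the draft's assumption that the payload can be sliced with text[14:-1] and then parsed as a plain literal, and only swaps eval() for ast.literal_eval() so code in the response cannot execute; iter_city_start_urls is a name invented here, not part of the original script.

import ast
import requests

def iter_city_start_urls(js_url='http://js.soufunimg.com/homepage/new/family/css/citys.js?v=20170520'):
    # Assumption carried over from the draft: the response is a "var ...=[...];"
    # style literal that survives slicing off the prefix and trailing character.
    text = requests.get(js_url).text
    cities = ast.literal_eval(text[14:-1])
    for city in cities:
        subdomain = city['url'].split('.')[0].split('//')[1]
        yield city['name'], 'http://newhouse.{}.fang.com/house/s/'.format(subdomain)

# Hypothetical usage, replacing the single-city call in join_url():
#     for name, start_url in iter_city_start_urls():
#         self.gain_num(name, start_url)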
(The code still has a few rough edges. If you have a better approach, get in touch so we can improve it together!)
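One possible improvement: the abandoned threadpool fragment in gain_num() pointed toward fetching listing pages concurrently. Here is a rough sketch of that idea using the standard library's concurrent.futures instead of the third-party threadpool module; crawl_pages_concurrently and workers are illustrative names and values, not part of the original code.

from concurrent.futures import ThreadPoolExecutor

def crawl_pages_concurrently(house, city, city_url, max_num, workers=8):
    # Build the same 'b9<n>' page URLs as the serial loop in gain_num().
    jobs = [(city, city_url + 'b9' + str(num), num) for num in range(1, int(max_num) + 1)]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(house.parse_page, c, u, n) for c, u, n in jobs]
        for future in futures:
            future.result()  # surface any exception raised inside a worker

# Hypothetical usage inside gain_num(), replacing the serial for-loop:
#     crawl_pages_concurrently(self, city, city_url, max_num)

Note that parse_page() prints its results, so concurrent runs will interleave output; for a real speed-up the prints would need to be replaced with a thread-safe sink such as a queue or a database writer.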