分享web开发知识

注册/登录|最近发布|今日推荐

主页 IT知识网页技术软件开发前端开发代码编程运营维护技术分享教程案例
当前位置:首页 > 运营维护

爬取房天下整个网站房产数据。。。

发布时间:2023-09-06 01:21责任编辑:熊小新关键词:暂无标签

以前爬取的数据量都比较少,所以这次写一个爬取全站数据的爬虫:用redis对URL进行去重,用mysql存储清洗后的房产数据,并用线程池进行调度,实现多线程爬取。

下面的代码用于获取房天下所有地区二手房和新房的URL,为后续爬取提供起始URL:

 1 import requests 2 from lxml import etree 3 ?4 ?5 class Ftx_newhouse_Secondhandhouse(object): 6 ?7 ????headers = { 8 ????????‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36‘, 9 ????????‘Cookie‘: ‘global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14‘10 11 ????}12 13 ????def __init__(self):14 ????????self.url = ‘http://newhouse.fang.com/house/s/‘15 ????????self.s = requests.session()16 17 18 ????def Newhouse_ftx(self):19 ????????try:20 ????????????response = self.s.post(self.url,headers=self.headers,verify=False)21 ????????except Exception as e:22 ????????????print(‘error:‘,e)23 ????????response.encoding = ‘gb2312‘24 ????????urls = etree.HTML(response.text)25 ????????xf_adress = urls.xpath(‘//div[@class="city20141104"]/div[3]/a/text()|‘26 
????????????????????????????‘//div[@class="city20141104"]/div[4]/a/text()|‘27 ????????????????????????????‘//div[@class="city20141104"]/div[5]/a/text()‘28 ????????????????????????????)29 ????????xf_url = urls.xpath(‘//div[@class="city20141104"]/div[3]/a/@href|‘30 ????????????????????????????‘//div[@class="city20141104"]/div[4]/a/@href|‘31 ????????????????????????????‘//div[@class="city20141104"]/div[5]/a/@href‘32 ????????????????????????????)33 34 ????????return (dict(zip(xf_adress,xf_url)))35 36 ????def Secondhandhouse_ftx(self):37 ????????self.url = ‘http://esf.sh.fang.com/newsecond/esfcities.aspx‘38 ????????try:39 ????????????html ?= requests.get(self.url,headers=self.headers,timeout=4)40 ????????except Exception as e:41 ????????????print(‘error:‘,e)42 ????????html.encoding = ‘gb2312‘43 ????????Secondhandhouse_urls = etree.HTML(html.text)44 ????????xf_url = Secondhandhouse_urls.xpath(‘//div[@class="onCont"]/ul/li/a/text()‘)45 ????????xf_adress ?= Secondhandhouse_urls.xpath(‘//div[@class="onCont"]/ul/li/a/@href‘)46 ????????dictx = dict(zip(xf_url,xf_adress))47 ????????return dictx




下面是爬取房产数据代码:

 1 import requests,redis,pymysql 2 from mywed.fangtianxia.url import Ftx_newhouse_Secondhandhouse 3 from lxml import etree 4 from concurrent.futures import ThreadPoolExecutor 5 import re,os,time 6 from mywed.fangtianxia.logs import log_run 7 ?8 Secondhandhouse_urls_set = {‘http://esf.hbjs.fang.com‘} 9 dr = Ftx_newhouse_Secondhandhouse()10 w = dr.Secondhandhouse_ftx()11 for i in w.values():12 ????Secondhandhouse_urls_set.add(i)13 print(Secondhandhouse_urls_set)14 15 16 17 class Secondhandhouse(object):18 19 ????headers = {20 ????????‘User-Agent‘: ‘Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36‘,21 ????????‘Cookie‘: ‘global_cookie=5n55ylc24xzrdp58gka2fm0mx2lj4mqfqak; Integrateactivity=notincludemc; vh_newhouse=3_1499483589_17454%5B%3A%7C%40%7C%3A%5D9af16b0d610e2cdd596b0d5a35400fbd; newhouse_user_guid=925B3734-6802-3162-165C-B593DAA860F1; recentViewlpNew_newhouse=3_1502607112_9948%5B%3A%7C%40%7C%3A%5D54e263288e4374965795dfe7c94c7fd3; city=heyuan; polling_imei=232d98985399f89e; token=59c66a51681142018630f1745e1e739f; Captcha=6E6B7334505855746454384A743161514A46696B346D577833476C613647745662647355494E7570596D4C52612B564F45473832462B59674B5A6E504C63386A34614767326774426455773D; __utmt_t0=1; __utmt_t1=1; __utmt_t2=1; sfut=33A48A581B218095B1D7CE492BDDCA86292F2A06B82634CBDD1201D2545F42EE4B54A2BC1247390DE02741E7CA2C9A911EA425B693C59EC2D62EDD7A4D70012C0F8DEE007CB20A5E2A74C8A9B17D4A8E3A7698ADDEAEC479D29D9DC82BC746FB; passport=usertype=1&userid=100371905&username=huangsonghui&password=&isvalid=1&validation=; agent_validation=a=0; __utma=147393320.331855580.1499000907.1504415980.1508935988.27; __utmb=147393320.49.10.1508935988; __utmc=147393320; __utmz=147393320.1508935988.27.21.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; unique_cookie=U_35b7j0utahefmuagw4fol4w8y1bj971iz3h*14‘22 23 ????}24 25 ????def get_newhouse_data(self,url):26 27 ????????for num in range(102):28 ????????????second_url = url + ‘/house/i3‘ + str(num)29 
????????????try:30 ????????????????while True:31 ????????????????????reponse = requests.get(url,headers=self.headers,timeout=3)32 ????????????????????reponse.encoding = ‘gbk‘33 ????????????????????#print(reponse.text)34 ????????????????????if reponse.status_code ==200:35 ????????????????????????break36 ????????????????????else:37 ????????????????????????print(‘restart donwing ......‘)38 ????????????except Exception as e:39 ????????????????log_run.File_enter_error(e)40 ????????????select = etree.HTML(str(reponse.text))41 42 ????????????if not len(select.xpath(‘//a[@id="PageControl1_hlk_next"]/text()‘)):43 ????????????????break44 ????????????else:45 ????????????????content_list = select.xpath(‘//dd[@class="info rel floatr"]‘)46 ????????????????#print(content_list)47 48 ????????????????for i in content_list:49 ????????????????????title = i.xpath(‘./p[1]/a/@title‘)50 ????????????????????content = i.xpath(‘./p[2]/text()‘)51 ????????????????????name = i.xpath(‘./p[3]/a/span/text()‘)52 ????????????????????adress = i.xpath(‘./p[3]/span/text()‘)53 ????????????????try:54 ????????????????????size_list = select.xpath(‘//div[@class="area alignR"]‘)55 ????????????????????size = [ii.xpath(‘./p/text()‘) for ii in size_list]56 ????????????????????average_price_list = select.xpath(‘//p[@class="danjia alignR mt5"]‘)57 ????????????????????average_price = [‘/‘.join(iii.xpath(‘./text()‘)) for iii in average_price_list]58 ????????????????????sum_price_list = select.xpath(‘//p[@class="mt5 alignR"]‘)59 ????????????????????sum_price = [‘‘.join(iiii.xpath(‘./span/text()‘)) for iiii in sum_price_list]60 ????????????????except Exception as e:61 ????????????????????log_run.File_enter_error(e)62 ????????????????print(title)63 64 65 if __name__ =="__main__":66 ????t = Secondhandhouse()67 ????t.get_newhouse_data(‘http://esf.fang.com/house/i33/‘)68 ????#s = t.get_newhouse_data69 ????#pool = ThreadPoolExecutor(30)70 ????#f = pool.map(s,Secondhandhouse_urls_set)


爬取房天下整个网站房产数据。。。

原文地址:http://www.cnblogs.com/Huangsh2017Come-on/p/7750417.html

知识推荐

我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8 不良信息举报平台 互联网安全管理备案 Copyright 2023 www.wodecom.cn All Rights Reserved