"""Scrape second-hand housing listings from bj.lianjia.com.

Fetches listing pages 1-100, extracts one record per listing with a regular
expression, prints each record and appends it as a JSON line to the file
``houseInfo`` in the current working directory.
"""
import json
import re
import ssl
from urllib.request import urlopen

# Disable HTTPS certificate verification for every urllib request made by
# this process.
# NOTE(review): this is insecure (it allows man-in-the-middle attacks) and
# relies on a private ``ssl`` attribute. It is kept because the original
# script depends on it; prefer passing an explicit ``context=`` to urlopen.
ssl._create_default_https_context = ssl._create_unverified_context

# URL of the first listing page.
ershoufang_url = 'https://bj.lianjia.com/ershoufang/rs/'

# URL pattern used for page 2 onwards; ``%d`` is the page number.
_PAGE_URL_TEMPLATE = 'https://bj.lianjia.com/ershoufang/pg%d/'

# Number of the last page fetched by ``main``.
_LAST_PAGE = 100

# Compiled once at import time instead of on every ``chuli`` call.
# ``re.S`` lets ``.`` match newlines so a listing may span several lines.
_LISTING_RE = re.compile(
    r'<span.*?>关注</span></div><div.*?><span></span></div><div.*?><span></span></div><div class="price"><span>(?P<price>.*?)</span>万</div></a><a.*?>(?P<title>.*?)</a><div class="info">.*?<span>/</span>.*?<span>/</span>(?P<pingmi>.*?)<span>/</span>(?P<fangxiang>.*?)<span>/</span>(?P<zhuangxiu>.*?)</div><div .*?>(?:<span .*?>.*?</span>)?<span.*?>(?P<fangben>.*?)</span>',
    re.S,
)


def get_html_content(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content as ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if read/decode fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')


def chuli(content):
    """Yield one dict per listing found in the HTML text ``content``.

    Args:
        content: HTML source of a listing page.

    Yields:
        dict: Maps the labels '价格:', '房屋信息:', '平米数:', '朝向', '装修:'
        and '房本信息:' to the text captured for that listing.
    """
    for match in _LISTING_RE.finditer(content):
        yield {
            '价格:': match.group('price') + '万',
            '房屋信息:': match.group('title'),
            '平米数:': match.group('pingmi'),
            # This key has no trailing colon, unlike the others; it is kept
            # as-is so the format of existing output does not change.
            '朝向': match.group('fangxiang'),
            # Separators left inside the captured text become commas.
            '装修:': match.group('zhuangxiu').replace('<span>/</span>', ','),
            # '随时看房' and '关注' are replaced by the placeholder '无信息'.
            '房本信息:': match.group('fangben')
            .replace('随时看房', '无信息')
            .replace('关注', '无信息'),
        }


def xieru(jieguo):
    """Append ``jieguo`` as one JSON line to the file ``houseInfo``.

    Args:
        jieguo: JSON-serialisable record, e.g. a dict yielded by ``chuli``.
    """
    # ensure_ascii=False writes Chinese text readably instead of \u escapes.
    txt = json.dumps(jieguo, ensure_ascii=False)
    with open('houseInfo', mode='a', encoding='utf-8') as f:
        f.write(txt + '\n')


def main():
    """Fetch pages 1 to 100, then save and print every extracted record."""
    for i in range(1, _LAST_PAGE + 1):
        # Page 1 has its own URL; later pages use the numbered pattern.
        url = ershoufang_url if i == 1 else _PAGE_URL_TEMPLATE % i
        for el in chuli(get_html_content(url)):
            xieru(el)
            print(el)


if __name__ == '__main__':
    main()
爬虫链家网站获取信息
原文地址:https://www.cnblogs.com/PythonMrChu/p/9785661.html