"""Scrape the news list on dangjian.gmw.cn and print every linked article.

The script downloads the index page, collects the links found inside the
``ul.channel-newsGroup`` lists, then fetches each linked page and prints its
title followed by the text of its body paragraphs.
"""
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

INDEX_URL = "http://dangjian.gmw.cn/node_11940.htm"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever


def fetch_soup(url):
    """Download *url* and return the page parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Debugging tip: print(soup.prettify()) dumps the formatted document.
    return BeautifulSoup(response.content, "lxml")


def collect_article_urls(soup, base_url=INDEX_URL):
    """Return the set of absolute article URLs linked from the index page.

    Only anchors inside ``<ul class="channel-newsGroup">`` are considered.
    Relative hrefs are resolved against *base_url*; anchors without an href
    and links that do not resolve to http(s) are ignored.
    """
    urls = set()
    for group in soup.find_all("ul", class_="channel-newsGroup"):
        # href=True skips anchors that have no href attribute at all,
        # which would otherwise raise KeyError on anchor["href"].
        for anchor in group.find_all("a", href=True):
            absolute = urljoin(base_url, anchor["href"].strip())
            if absolute.startswith(("http://", "https://")):
                urls.add(absolute)
    return urls


def extract_article(soup):
    """Return ``(title, content)`` extracted from an article page.

    The title is the text of the element with id ``articleTitle`` (empty
    string when that element is absent). The content is the concatenated
    text of the ``<p>`` children of ``#contentMain``.
    """
    title_tag = soup.find(id="articleTitle")
    title = title_tag.get_text(strip=True) if title_tag is not None else ""
    paragraphs = soup.select("div #contentMain > p")
    # get_text() instead of .string: .string is None for a paragraph with
    # several child nodes, which previously put the word "None" in the output.
    content = "".join(paragraph.get_text() for paragraph in paragraphs)
    return title, content


def main():
    """Fetch the index page, then print the title and body of each article."""
    urls = collect_article_urls(fetch_soup(INDEX_URL))
    print(urls)
    # Sorted so repeated runs visit the articles in the same order.
    for url in sorted(urls):
        try:
            article = fetch_soup(url)
        except requests.RequestException as exc:
            # One dead link should not abort the remaining downloads.
            print(f"skipping {url}: {exc}", file=sys.stderr)
            continue
        title, content = extract_article(article)
        print(title)
        print(content)


if __name__ == "__main__":
    main()
# Parsing web pages with BeautifulSoup (BeautifulSoup解析网页)
# Original article: https://www.cnblogs.com/cord/p/9452950.html