BeautifulSoup
该模块用于接收一个HTML或XML字符串,然后将其进行格式化,之后便可以使用它提供的方法快速查找指定元素,从而使得在HTML或XML中查找指定元素变得简单。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | from bs4 import BeautifulSoup html_doc = """ <html><head><title>The Dormouse‘s story</title></head> <body> asdf
<div >
<b>The Dormouse‘s story总共</b>
<h1>f</h1>
</div> <div >Once upon a time there were three little sisters; and their names were
<a >Els<span>f</span>ie</a>,
<a href="http://example.com/lacie" >Lacie</a> and
<a href="http://example.com/tillie" >Tillie</a>; and they lived at the bottom of a well.</div> ad<br/>sf <p >...</p> </body> </html> """ soup = BeautifulSoup(html_doc, features = "lxml" ) # 找到第一个a标签 tag1 = soup.find(name = ‘a‘ ) # 找到所有的a标签 tag2 = soup.find_all(name = ‘a‘ ) # 找到id=link2的标签 tag3 = soup.select( ‘#link2‘ ) |
使用示例:
1 2 3 4 5 6 7 8 9 10 11 | from bs4 import BeautifulSoup html_doc = """ <html><head><title>The Dormouse‘s story</title></head> <body>
... </body> </html> """ soup = BeautifulSoup(html_doc, features = "lxml" ) |
1. name,标签名称
1 2 3 4 5 | # tag = soup.find(‘a‘) # name = tag.name # 获取 # print(name) # tag.name = ‘span‘ # 设置 # print(soup) |
2. attrs,标签属性
1 2 3 4 5 6 | # tag = soup.find(‘a‘) # attrs = tag.attrs # 获取 # print(attrs) # tag.attrs = {‘ik‘:123} # 设置 # tag.attrs[‘id‘] = ‘iiiii‘ # 设置 # print(soup) |
3. children,所有子标签
1 2 | # body = soup.find(‘body‘) # v = body.children |
4. descendants,所有子子孙孙标签
1 2 | # body = soup.find(‘body‘) # v = body.descendants |
5. clear,将标签的所有子标签全部清空(保留标签名)
1 2 3 | # tag = soup.find(‘body‘) # tag.clear() # print(soup) |
6. decompose,递归地删除当前标签及其所有子标签
1 2 3 | # body = soup.find(‘body‘) # body.decompose() # print(soup) |
7. extract,递归地删除当前标签及其所有子标签,并返回被删除的标签
1 2 3 | # body = soup.find(‘body‘) # v = body.extract() # print(soup) |
8.decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
1 2 3 4 | # body = soup.find(‘body‘) # v = body.decode() # v = body.decode_contents() # print(v) |
9.encode,转换为字节(含当前标签);encode_contents(不含当前标签)
1 2 3 4 | # body = soup.find(‘body‘) # v = body.encode() # v = body.encode_contents() # print(v) |
10.find,获取匹配的第一个标签
1 2 3 4 5 | # tag = soup.find(‘a‘) # print(tag) # tag = soup.find(name=‘a‘, attrs={‘class‘: ‘sister‘}, recursive=True, text=‘Lacie‘) # tag = soup.find(name=‘a‘, class_=‘sister‘, recursive=True, text=‘Lacie‘) # print(tag) |
11. find_all,获取匹配的所有标签
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | # tags = soup.find_all(‘a‘) # print(tags) # tags = soup.find_all(‘a‘,limit=1) # print(tags) # tags = soup.find_all(name=‘a‘, attrs={‘class‘: ‘sister‘}, recursive=True, text=‘Lacie‘) # # tags = soup.find(name=‘a‘, class_=‘sister‘, recursive=True, text=‘Lacie‘) # print(tags) # ####### 列表 ####### # v = soup.find_all(name=[‘a‘,‘div‘]) # print(v) # v = soup.find_all(class_=[‘sister0‘, ‘sister‘]) # print(v) # v = soup.find_all(text=[‘Tillie‘]) # print(v, type(v[0])) # v = soup.find_all(id=[‘link1‘,‘link2‘]) # print(v) # v = soup.find_all(href=[‘link1‘,‘link2‘]) # print(v) # ####### 正则 ####### import re # rep = re.compile(‘p‘) # rep = re.compile(‘^p‘) # v = soup.find_all(name=rep) # print(v) # rep = re.compile(‘sister.*‘) # v = soup.find_all(class_=rep) # print(v) # rep = re.compile(‘http://www.oldboy.com/static/.*‘) # v = soup.find_all(href=rep) # print(v) # ####### 方法筛选 ####### # def func(tag): # return tag.has_attr(‘class‘) and tag.has_attr(‘id‘) # v = soup.find_all(name=func) # print(v) # ## get,获取标签属性 # tag = soup.find(‘a‘) # v = tag.get(‘id‘) # print(v) |
12.has_attr,检查标签是否具有该属性
1 2 3 | # tag = soup.find(‘a‘) # v = tag.has_attr(‘id‘) # print(v) |
13.get_text,获取标签内部文本内容
1 2 3 | # tag = soup.find('a') # v = tag.get_text() # print(v) |
14.index,检查标签在某标签中的索引位置
1 2 3 4 5 6 7 | # tag = soup.find(‘body‘) # v = tag.index(tag.find(‘div‘)) # print(v) # tag = soup.find(‘body‘) # for i,v in enumerate(tag): # print(i,v) |
15.is_empty_element,是否是空标签(是否可以是空)或者自闭合标签,
判断是否是如下标签:‘br‘ , ‘hr‘, ‘input‘, ‘img‘, ‘meta‘,‘spacer‘, ‘link‘, ‘frame‘, ‘base‘
1 2 3 | # tag = soup.find(‘br‘) # v = tag.is_empty_element # print(v) |
16. 当前的关联标签
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | # soup.next # soup.next_element # soup.next_elements # soup.next_sibling # soup.next_siblings # # tag.previous # tag.previous_element # tag.previous_elements # tag.previous_sibling # tag.previous_siblings # # tag.parent # tag.parents |
17. 查找某标签的关联标签
1 2 3 4 5 6 7 8 9 10 11 12 13 14 | # tag.find_next(...) # tag.find_all_next(...) # tag.find_next_sibling(...) # tag.find_next_siblings(...) <
知识推荐
我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8
不良信息举报平台
互联网安全管理备案
Copyright 2023 www.wodecom.cn All Rights Reserved |