https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
# Demo: download a page with urllib3 and explore it with Beautiful Soup.
import urllib3
from bs4 import BeautifulSoup

# Fetch the page; the body arrives as bytes in `res.data`.
http = urllib3.PoolManager()
res = http.request('GET', 'http://www.baidu.com')

# Decode the bytes as UTF-8 and parse them with the "html.parser" backend.
soup = BeautifulSoup(res.data.decode('utf-8'), 'html.parser')

# Whole document, re-indented for reading.
print(soup.prettify())

# The <title> tag: the tag itself, its tag name, its text, and the name of
# the tag that contains it.
print(soup.title)
print(soup.title.name)
print(soup.title.text)
print(soup.title.string)
print(soup.title.parent.name)

# First <div> in the document and its "id" attribute.
# NOTE: subscript access raises KeyError when the attribute is missing;
# the loop below uses .get(), which returns None instead.
print(soup.div)
print(soup.div['id'])

# Every <div>, then a lookup of one element by its id attribute.
print(soup.find_all('div'))
print(soup.find(id='c-tips-container'))

# The id of each <div> (None for a <div> that has no id).
for div in soup.find_all('div'):
    print(div.get('id'))

# All text in the document with the markup stripped.
print(soup.get_text())
|
解析器安装
lxml
1 2 3 4 5
# Install the lxml parser first (shell command, run outside Python):
#   pip install lxml

# `markup` stands for the document text to parse.

# lxml's HTML parser.
BeautifulSoup(markup, "lxml")

# lxml's XML parser, selected via a feature list or via the "xml" shorthand.
BeautifulSoup(markup, ["lxml-xml"])
BeautifulSoup(markup, "xml")
|
html5lib
1 2 3
# Install the html5lib parser first (shell command, run outside Python):
#   pip install html5lib

# `markup` stands for the document text to parse.
BeautifulSoup(markup, "html5lib")
|