BeautifulSoup - Python library
Beautiful Soup helps you parse and analyze structured documents such as HTML and XML.
environment
- python: 3.9.0
Preparation
- install Beautiful Soup: `pip install beautifulsoup4`
- import beautiful soup
from bs4 import BeautifulSoup
Beautiful Soup overview
from bs4 import BeautifulSoup
def print_section_title(title: str) -> None:
    """Print *title* framed above and below by a rule of '=' characters.

    Args:
        title: Heading text; it is printed after a leading "= " marker.
    """
    # The same rule is printed before and after the title line.
    rule = "=============================================="
    print(rule)
    print("= " + title)
    print(rule)
# HTML text to analyze. A partial HTML fragment can be used as well.
# Every section below queries this same document through `soup`.
html = """
<html>
<head>
<title>title text</title>
</head>
<body>
<h1 class="heading" name="heading">first header</h1>
<p id="first_sentence">first</p>
<p name="sentence 2">second</p>
<p id="third_sentence" data-1="one" data-2="2">third</p>
<p name="sentence 4">final</p>
</body>
</html>
"""
# To analyze HTML, pass 'html.parser' as the second argument.
soup = BeautifulSoup(html, 'html.parser')
# Print the parsed tree to confirm the document was analyzed correctly.
print(soup.prettify())
# Walk the parsed tree by chaining tag names as attributes.
print_section_title("find elements with soup structure")
# The <title> element sits under html > head.
title = soup.html.head.title
print(f"title: {title.string}")
# Attribute access returns the first <p> inside <body>.
p = soup.html.body.p
print(f"first p: {p.string}")
# The node right after that <p> is the text node holding the line feed and spaces.
blank = p.next_sibling
print(f"p sibling: {blank}")
# One more step past the whitespace node lands on the second <p>.
p = blank.next_sibling
print(f"second p: {p.string}")
# find elements by tag name
print_section_title("find elements by tag name")
# find_all returns every <p> element in the document.
p_elements = soup.find_all("p")
for p in p_elements:
    # Tag.get returns the fallback when the attribute is absent, which
    # replaces the separate membership test. The variable is not called
    # `id` so the built-in id() stays usable.
    element_id = p.get('id', '(no id)')
    text = p.string
    print(element_id + ' : ' + text)
# find element with attribute
print_section_title("find elements with attribute")
# `name` is find()'s own parameter for the TAG name, so find(name="heading")
# would look for a <heading> tag and return None. An HTML attribute that is
# itself called "name" has to be passed through `attrs` instead.
h1 = soup.find(attrs={"name": "heading"})
# Other attributes can be given directly as keyword arguments.
p = soup.find('p', id="first_sentence")
print("p with id: " + p.string)
# find element with a compound condition: a dict passed as the second
# argument filters on several attributes at once. Parsed attribute values
# are strings, so the expected values are written as strings too.
condition = {'data-1': 'one', 'data-2': '2'}
p = soup.find('p', condition)
print("p with condition: " + p.string)
# find elements by CSS selector
print_section_title("find elements with CSS selector")
# select_one returns a single element matching the selector.
heading = soup.select_one("body > h1.heading")
print("heading: " + heading.string)
p = soup.select_one("p[id='first_sentence']")
print("p: " + p.string)
# select returns every element matching the selector.
p_elements = soup.select("body > p")
for p in p_elements:
    print(p.string)
# Beautiful Soup can be combined with regular expressions
print_section_title("find elements with regular expression")
import re
# Match every element whose id ends with "e". In the HTML above both
# "first_sentence" and "third_sentence" qualify.
exp = re.compile(r'e$')
p_elements = soup.find_all(id=exp)
for p in p_elements:
    print(p.string)
Output
<html>
<head>
<title>
title text
</title>
</head>
<body>
<h1 class="heading" name="heading">
first header
</h1>
<p id="first_sentence">
first
</p>
<p name="sentence 2">
second
</p>
<p id="third_sentence" data-1="one" data-2="2">
third
</p>
<p name="sentence 4">
final
</p>
</body>
</html>
==============================================
= find elements with soup structure
==============================================
title: title text
first p: first
p sibling:
second p: second
==============================================
= find elements by tag name
==============================================
first_sentence : first
(no id) : second
third_sentence : third
(no id) : final
==============================================
= find elements with attribute
==============================================
p with id: first
p with condition: third
==============================================
= find elements with CSS selector
==============================================
heading: first header
p: first
first
second
third
final
==============================================
= find elements with regular expression
==============================================
first
third