External Exam Download Resources Web Applications Games Recycle Bin

BS4 Functions

bs4 functions.py

from bs4 import BeautifulSoup
import urllib.request
# Download the timetable page and parse it with Python's built-in HTML parser.
url = "https://digisoln.com/python/webscraping/timetable.html"
# Open the response in a `with` block so the HTTP connection is always closed,
# even if parsing raises. BeautifulSoup reads the whole response while the
# block is active, so `soup` remains fully usable afterwards.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, features="html.parser")

# Show the parsed document in several forms, one after another.
print("raw:", soup)  # markup exactly as parsed: unindented, tags included

indented_markup = soup.prettify()  # markup re-indented to show the nesting
print("prettify:", indented_markup)

plain_text = soup.get_text()  # text content only, all tags removed
print("text only:", plain_text)

title_tag = soup.title  # the <title> element itself
print("title:", title_tag)  # e.g. <title>Timetable</title>
print("title.string:", title_tag.string)  # e.g. Timetable

# Print every table cell as markup,
# e.g. <td>English <span class="room">B2</span></td>
table_cells = soup.find_all("td")
for cell in table_cells:
    print(cell)
    
first_link = soup.a  # attribute access returns the first <a> tag in the document
print("a:", first_link)  # e.g. <a href="...">...</a>

# find_all returns every <a> tag; print each whole tag, then only its href.
for anchor in soup.find_all("a"):
    print(anchor)  # e.g. <a href="...">...</a>
    href = anchor.get("href")
    print(href)  # URL only, e.g. https://www...

# find() returns only the first <span class="room"> in the document.
room = soup.find("span", class_="room")
print(room)  # e.g. <span class="room">A1</span>

# find_all() with the same filter returns every room span.
for room_span in soup.find_all("span", class_="room"):
    print(room_span)  # the whole tag, as above
    print(room_span.get_text())  # text content only, tags stripped

#more docs - https://www.crummy.com/software/BeautifulSoup/bs4/doc/

BS4 Documentation