BS4 Functions
bs4 functions.py
# Demo of common BeautifulSoup (bs4) calls, run against a small sample
# timetable page. The script downloads the page, parses it with the
# built-in "html.parser" backend, and prints several views of the document.
# The example outputs shown in the trailing comments are illustrative.
import urllib.request

from bs4 import BeautifulSoup

url = "https://digisoln.com/python/webscraping/timetable.html"

# Use the response as a context manager so the HTTP connection is closed
# as soon as the document has been parsed. BeautifulSoup reads the whole
# file-like object in its constructor, so `soup` stays usable afterwards.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, features="html.parser")

# --- Whole-document views ---
print("raw:", soup)  # unindented raw markup, HTML tags included
print("prettify:", soup.prettify())  # indented to show the nested structure
print("text only:", soup.get_text())  # text content with no HTML tags

# --- Accessing a tag by name ---
print("title:", soup.title)  # e.g. <title>Timetable</title>
print("title.string:", soup.title.string)  # e.g. Timetable

# --- Every tag with a given name ---
for td in soup.find_all("td"):
    print(td)  # e.g. <td>English <span class="room">B2</span></td> etc.

# --- Links ---
print("a:", soup.a)  # first (only) link, e.g. <a href="...">...</a>
for link in soup.find_all("a"):
    print(link)  # ALL links, e.g. <a href="...">...</a>
    print(link.get('href'))  # URL only, e.g. https://www...

# --- Filtering by attribute (CSS class) ---
room = soup.find("span", {"class": "room"})
print(room)  # first room only, e.g. <span class="room">A1</span>

for rooms in soup.find_all("span", {"class": "room"}):
    print(rooms)  # same form as the previous output, but for every room
    print(rooms.get_text())  # strips the HTML, leaving just the text

# More docs - https://www.crummy.com/software/BeautifulSoup/bs4/doc/