Snippets

# Get the names of the cast from a Wikipedia movie page
import requests
import bs4

response = requests.get("https://en.wikipedia.org/wiki/Sholay")
raw_html = response.text

structured_html = bs4.BeautifulSoup(raw_html, "lxml")
# The cast list sits in a multi-column block with the "div-col" class
tag_elements = structured_html.select(".div-col")
for tag_element in tag_elements:
    # walk the direct children of the block (typically a <ul> of cast members)
    for elements in tag_element.contents:
        if isinstance(elements, bs4.element.Tag):
            # then each list item; print the text of its link, skipping items without one
            for element in elements.contents:
                if isinstance(element, bs4.element.Tag) and element.a is not None:
                    print(element.a.contents[0])
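
# A more compact variant (a sketch, not the original approach): a descendant CSS
# selector can pull the cast links in one pass. This assumes the cast list is
# still rendered as <li><a> entries inside a .div-col block on the page.
import requests
import bs4

response = requests.get("https://en.wikipedia.org/wiki/Sholay")
structured_html = bs4.BeautifulSoup(response.text, "lxml")
for link in structured_html.select(".div-col li a"):
    print(link.get_text())
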
# Get all movie poster images from a Wikipedia page
import requests
import bs4
import re

response = requests.get("https://en.wikipedia.org/wiki/Sholay")
raw_html = response.text

structured_html = bs4.BeautifulSoup(raw_html, "lxml")

image_class_elements = structured_html.select(".mw-file-element")
image_link_dict = {}
image_suffix = 'poster.jpg'
# create a dict mapping each image name to its full URL
for image_src in image_class_elements:
    if re.search(r'poster\.jpg$', image_src['src']):
        # capture the part of the filename (no '/' characters) that comes right before 'poster.jpg'
        image_name = re.search(r'([^/]+)(?=poster\.jpg)', image_src['src']).group() + image_suffix
        image_link_dict[image_name] = 'https:' + image_src['src']

# For each item in the dict, write the image file to disk
# Since we are writing images, we need to use binary mode
for key, value in image_link_dict.items():
    print(key, value)
    with open(key,'wb') as f:
        f.write(requests.get(value).content)
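
# Optional hardening (a sketch): check the HTTP status code before writing each
# file, so a failed download does not leave an empty or truncated image on disk.
for key, value in image_link_dict.items():
    img_response = requests.get(value)
    if img_response.status_code == 200:
        with open(key, 'wb') as f:
            f.write(img_response.content)
    else:
        print(f"Skipping {key}: got status {img_response.status_code}")
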
# Get all unique author names from http://quotes.toscrape.com/
# Another sample URL for practicing web scraping is http://books.toscrape.com/index.html
import requests
import bs4

# Initialize variables
url = "http://quotes.toscrape.com/page/"   # base URL; the page number is appended below
page_is_valid = True
authors = set()
page = 1


# The site displays "No quotes found!" when a page number beyond the last page is requested
while page_is_valid:

    # Build the page URL dynamically from the page number
    page_url = url + str(page)

    # Request the page
    res = requests.get(page_url)

    # If we have gone past the last page, the response contains "No quotes found!"
    if "No quotes found!" in res.text:
        break

    # Turn into Soup
    soup = bs4.BeautifulSoup(res.text,'lxml')

    # Add Authors to our set
    for name in soup.select(".author"):
        authors.add(name.text)

    # Go to Next Page
    page += 1

print(f"Unique list of authors:\n{authors}")