example program¶

You can download all the example programs here.

# -*- coding: utf-8 -*-
"""
Fetch the text of the main headline article from the Guardian website.
Print it out and print out the URLs of any links in the text.
"""

import bs4
import requests

# %% Helper functions

def get_and_soupify(url):
    """Gets a webpage and turns it into a BeautifulSoup.

    Argument url: string URL

    Returns: bs4.BeautifulSoup

    Raises RuntimeError: if status of request is not '200 (OK)'
    """

    response = requests.get(url)

    if response.status_code != 200:
        # Put the status code, reason phrase and URL in the message so
        # the failure can be diagnosed from the exception text alone.
        msg = '{} ({}): {}'.format(response.status_code, response.reason, url)
        raise RuntimeError(msg)

    # Parse the response body using the 'lxml' parser.
    return bs4.BeautifulSoup(response.text, 'lxml')

def print_output(content, title):
    """Print some output with a neat header line.

    Arguments:
        content: string content to print
        title: string title to display in header

    Returns: None (the text is written to standard output)
    """

    header = '#### ' + title + ' ####'
    # The footer is a solid rule of '#' exactly as wide as the header.
    footer = '#' * len(header)

    # The leading empty string puts a blank line before the header.
    print('', header, content, footer, sep='\n')

# %% Main page

main_url = 'https://www.theguardian.com/international'

page = get_and_soupify(main_url)

# Target:
# Within the <section> with id 'headlines' ...
# the first <a> tag ...
# with class 'fc-item__link' ...
# and data-link-name 'article'.

headlines = page.find('section', attrs={'id': 'headlines'})
if headlines is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError("No <section> with id 'headlines' found: " + main_url)

top_headline = headlines.find('a', attrs={'class': 'fc-item__link', 'data-link-name': 'article'})
if top_headline is None:
    raise RuntimeError('No headline article link found: ' + main_url)

article_url = top_headline['href']

# %% Top article page

article_page = get_and_soupify(article_url)

# Target:
# Within the first <div> tag with class 'content__article-body' ...
# all the <p> tags.

main_section = article_page.find('div', attrs={'class': 'content__article-body'})
if main_section is None:
    raise RuntimeError('No article body found: ' + article_url)

paragraphs = main_section.find_all('p')

# %% Article text

paragraph_texts = [p.get_text() for p in paragraphs]

print_output('\n'.join(paragraph_texts), 'Body text')

# %% Linked URLs

links = []

for p in paragraphs:
    # href=True matches only <a> tags that actually carry an href
    # attribute, so a['href'] below cannot raise KeyError.
    a_tags = p.find_all('a', href=True)
    for a in a_tags:
        links.append(a['href'])

print_output('\n'.join(links), 'Linked URLs')