Initial Version¶
We will start by writing the
parse_books_page(response: Response)
function that will extract the book URLs from each page.
The function we saw in the introduction merely returns the total number of books on the index page.
We will now extend it to return all the book URLs for each page.
def parse_books_page(
    response: Response, pagination: bool = False
) -> Iterator[dict | Request]:
    """Parse a books index page.

    Yields, in order:

    1. A summary dict for the page holding its URL, its title and the
       number of books listed on it.
    2. One ``Request`` per listed book, with ``parse_book_details`` as the
       callback.
    3. When ``pagination`` is true and the page has a "next" link, one
       ``Request`` for the following index page.

    Args:
        response: The response for a books index page.
        pagination: Whether to follow the "next" link to the following page.
    """
    articles = response.html.find_all("article", {"class": "product_pod"})
    yield {
        "url": response.url,
        "title": response.html.title.get_text(strip=True),
        "articles": len(articles),
    }
    for article in articles:
        # urljoin resolves the (possibly relative) href against the page URL.
        url = urljoin(response.url, article.h3.a["href"])
        yield Request(url=url, callback=parse_book_details, client=response.client)
    if not pagination:
        return
    next_page = response.html.find("li", {"class": "next"})
    if next_page is None:
        # No "next" link on this page: nothing more to follow.
        return
    # Resolve against the same base URL as the book links above.
    next_page_url = urljoin(response.url, next_page.a["href"])
    yield Request(
        url=next_page_url,
        # Bind the flag into the callback. A bare ``parse_books_page`` would be
        # run with the default ``pagination=False`` (assuming the service calls
        # ``callback(response)``), so the crawl would stop after one extra page.
        callback=partial(parse_books_page, pagination=pagination),
        client=response.client,
    )
The function now starts by yielding a dictionary with basic page details: the page URL, the title, and the number of books on the page. We will call this BooksPage for now.
It then proceeds to extract the book URLs and yields a new Request object for each book URL.
Each new request will call the parse_book_details function, which finally returns data about each book, i.e. BookDetails.
The function is also taking a bool keyword argument, pagination, which will be used to determine if we need to follow the pagination links.
If so, we parse the next page URL and yield a new Request object for the next page, which calls back parse_books_page recursively until we reach the last page, at which point we return.
def parse_book_details(response: Response) -> dict:
    """Extract the title and price from a single book page.

    Args:
        response: The response for a book details page.

    Returns:
        A dict with the text of the first ``h1`` element under ``"title"``
        and the text of the first ``p`` element with class ``price_color``
        under ``"price"``.
    """
    page = response.html
    return {
        "title": page.find("h1").text,
        "price": page.find("p", {"class": "price_color"}).text,
    }
Now we can create the Request objects to start the scraping process and run the DataService within a main function.
def main():
    """Scrape the books index page and pretty-print the collected results."""
    index_request = Request(
        url="https://books.toscrape.com/index.html",
        callback=parse_books_page,
        client=HttpXClient(),
    )
    service = DataService([index_request])
    # Iterating the service produces the results; collect them all in a tuple.
    results = tuple(service)
    pprint(results)
Full code for the books_scraper example:
"""Simple example of scraping books from a website with pagination argument."""
import timeit
from functools import partial
from pprint import pprint
from typing import Iterator
from urllib.parse import urljoin

from dataservice import DataService, HttpXClient, Request, Response
def parse_books_page(
    response: Response, pagination: bool = False
) -> Iterator[dict | Request]:
    """Parse a books index page.

    Yields, in order:

    1. A summary dict for the page holding its URL, its title and the
       number of books listed on it.
    2. One ``Request`` per listed book, with ``parse_book_details`` as the
       callback.
    3. When ``pagination`` is true and the page has a "next" link, one
       ``Request`` for the following index page.

    Args:
        response: The response for a books index page.
        pagination: Whether to follow the "next" link to the following page.
    """
    articles = response.html.find_all("article", {"class": "product_pod"})
    yield {
        "url": response.url,
        "title": response.html.title.get_text(strip=True),
        "articles": len(articles),
    }
    for article in articles:
        # urljoin resolves the (possibly relative) href against the page URL.
        url = urljoin(response.url, article.h3.a["href"])
        yield Request(url=url, callback=parse_book_details, client=response.client)
    if not pagination:
        return
    next_page = response.html.find("li", {"class": "next"})
    if next_page is None:
        # No "next" link on this page: nothing more to follow.
        return
    # Resolve against the same base URL as the book links above.
    next_page_url = urljoin(response.url, next_page.a["href"])
    yield Request(
        url=next_page_url,
        # Bind the flag into the callback. A bare ``parse_books_page`` would be
        # run with the default ``pagination=False`` (assuming the service calls
        # ``callback(response)``), so the crawl would stop after one extra page.
        callback=partial(parse_books_page, pagination=pagination),
        client=response.client,
    )
def parse_book_details(response: Response) -> dict:
    """Extract the title and price from a single book page.

    Args:
        response: The response for a book details page.

    Returns:
        A dict with the text of the first ``h1`` element under ``"title"``
        and the text of the first ``p`` element with class ``price_color``
        under ``"price"``.
    """
    page = response.html
    return {
        "title": page.find("h1").text,
        "price": page.find("p", {"class": "price_color"}).text,
    }
def main():
    """Scrape the books index page and pretty-print the collected results."""
    index_request = Request(
        url="https://books.toscrape.com/index.html",
        callback=parse_books_page,
        client=HttpXClient(),
    )
    service = DataService([index_request])
    # Iterating the service produces the results; collect them all in a tuple.
    results = tuple(service)
    pprint(results)
if __name__ == "__main__":
    # Run main() exactly once and report how long it took.
    elapsed = timeit.timeit(main, number=1)
    print(f"Elapsed time: {elapsed:.2f} seconds.")
If you run the script, within a few seconds (pagination is off for now!), you will see a tuple of dictionaries with the books page and book details printed to the console.
({'articles': 20,
'title': 'All products | Books to Scrape - Sandbox',
'url': 'https://books.toscrape.com/index.html'},
{'price': '£51.77', 'title': 'A Light in the Attic'},
{'price': '£22.65', 'title': 'The Requiem Red'},
{'price': '£17.93',
'title': 'The Coming Woman: A Novel Based on the Life of the Infamous '
'Feminist, Victoria Woodhull'},
{'price': '£20.66', 'title': "Shakespeare's Sonnets"},
{'price': '£52.29',
'title': "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)"},
{'price': '£23.88', 'title': 'Olio'},
{'price': '£13.99', 'title': 'Starving Hearts (Triangular Trade Trilogy, #1)'},
{'price': '£47.82', 'title': 'Sharp Objects'},
{'price': '£57.25',
'title': 'Our Band Could Be Your Life: Scenes from the American Indie '
'Underground, 1981-1991'},
{'price': '£52.15', 'title': 'The Black Maria'},
{'price': '£51.33', 'title': 'Libertarianism for Beginners'},
{'price': '£53.74', 'title': 'Tipping the Velvet'},
{'price': '£50.10', 'title': 'Soumission'},
{'price': '£35.02', 'title': 'Rip it Up and Start Again'},
{'price': '£33.34',
'title': 'The Dirty Little Secrets of Getting Your Dream Job'},
{'price': '£22.60',
'title': 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold '
'at the 1936 Berlin Olympics'},
{'price': '£37.59',
'title': 'Mesaerion: The Best Science Fiction Stories 1800-1849'},
{'price': '£17.46', 'title': 'Set Me Free'},
{'price': '£45.17', 'title': "It's Only the Himalayas"},
{'price': '£54.23', 'title': 'Sapiens: A Brief History of Humankind'})
Elapsed time: 1.21 seconds.
Let’s move on to the next stage, where we will add some general improvements to the code and introduce a few more features.