Added section 11, web scraping project. · jaaks/complete-python-course@3da6b6e · GitHub
[go: up one dir, main page]

Skip to content

Commit 3da6b6e

Browse files
committed
Added section 11, web scraping project.
1 parent a628745 commit 3da6b6e

23 files changed

+567
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
logs.txt
12
data.db
23
*.key
34
*.indd
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import requests
2+
import logging
3+
4+
from pages.all_books_page import AllBooksPage
5+
6+
# Write timestamped log records (INFO and above) to logs.txt.
logging.basicConfig(format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='%d-%m-%Y %H:%M:%S',
                    level=logging.INFO,
                    filename='logs.txt')
logger = logging.getLogger('scraping')

# Seconds to wait for the server before giving up; without a timeout
# requests.get can block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

print('Loading books list...')
logger.info('Loading books list.')

# The landing page is fetched first only to learn how many catalogue
# pages there are.
logger.info('Requesting http://books.toscrape.com')
response = requests.get('http://books.toscrape.com', timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly rather than parse an error page
page_content = response.content

logger.debug('Creating AllBooksPage from page content.')
page = AllBooksPage(page_content)

_books = []

# page_count is a property that re-parses the pager (and logs) on every
# access, so read it once.
page_count = page.page_count
logger.info(f'Going through {page_count} pages of books...')
for page_num in range(page_count):
    # Catalogue pages are numbered from 1, range() counts from 0.
    url = f'http://books.toscrape.com/catalogue/page-{page_num+1}.html'
    logger.info(f'Requesting {url}')
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    page_content = response.content
    logger.debug('Creating AllBooksPage from page content.')
    page = AllBooksPage(page_content)
    _books.extend(page.books)

# Public name imported by other modules (`from app import books`).
books = _books

section11/projects/scraping-books/locators/__init__.py

Whitespace-only changes.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
class AllBooksPageLocators:
    """CSS selectors used to find elements on a catalogue listing page."""

    # Selects every book entry in the listing grid.
    BOOKS = 'div.page_inner section li.col-xs-6.col-sm-4.col-md-3.col-lg-3'
    # Selects the pager's "current page" item.
    PAGER = 'div.page_inner section ul.pager li.current'
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
class BookLocators:
    """
    Locators for an item in the HTML page.

    This allows us to easily see what our code will be looking at
    as well as change it quickly if we notice it is now different.
    """

    # Name and link share one selector: both are read from the same <a>
    # element (its `title` and `href` attributes respectively).
    NAME_LOCATOR = 'article.product_pod h3 a'
    LINK_LOCATOR = 'article.product_pod h3 a'
    PRICE_LOCATOR = 'article.product_pod p.price_color'
    RATING_LOCATOR = 'article.product_pod p.star-rating'
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import logging
2+
3+
from app import books
4+
5+
logger = logging.getLogger('scraping.menu')


# Prompt shown before every command. The single-letter commands listed
# here are dispatched through `user_choices`; 'q' ends the menu loop.
USER_CHOICE = '''Enter one of the following

- 'b' to look at 5-star books
- 'c' to look at the cheapest books
- 'n' to just get the next available book on the page
- 'q' to exit

Enter your choice: '''
16+
17+
18+
def print_best_books():
    """Print the five books with the highest rating, best first."""
    logger.debug('Finding best books by rating...')
    # reverse=True keeps equally-rated books in their original order,
    # exactly as sorting on the negated rating did.
    best_books = sorted(books, key=lambda x: x.rating, reverse=True)[:5]
    for book in best_books:
        print(book)
23+
24+
25+
def print_cheapest_books():
    """Print the five books with the lowest price, cheapest first."""
    logger.debug('Finding best books by price...')
    cheapest_books = sorted(books, key=lambda x: x.price)[:5]
    for book in cheapest_books:
        print(book)
30+
31+
32+
# Shared across calls so that each 'n' command continues where the
# previous one stopped.
books_generator = (x for x in books)


def get_next_book():
    """
    Print the next book from `books_generator`.

    Once every book has been shown, a notice is printed instead of
    letting StopIteration propagate and terminate the menu loop.
    """
    logger.debug('Getting next book from generator of all books...')
    try:
        book = next(books_generator)
    except StopIteration:
        print('No more books available.')
        return
    print(book)
38+
39+
40+
# Maps each single-letter menu command to the function that handles it.
user_choices = {
    'b': print_best_books,
    'c': print_cheapest_books,
    'n': get_next_book,
}
45+
46+
47+
def menu():
    """
    Run the interactive prompt until the user enters 'q'.

    Each input is looked up in `user_choices` and the matching handler is
    called; anything else prints a hint and the prompt is shown again.
    """
    user_input = input(USER_CHOICE)
    while user_input != 'q':
        logger.debug('User did not choose to exit program.')
        # Test against the dispatch table itself so a newly registered
        # command can never be rejected by a stale hard-coded list.
        if user_input in user_choices:
            user_choices[user_input]()
        else:
            print('Please choose a valid command.')
        user_input = input(USER_CHOICE)
    logger.debug('Terminating program...')


# Start the menu as soon as this module is executed.
menu()

section11/projects/scraping-books/pages/__init__.py

Whitespace-only changes.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import re
2+
import logging
3+
4+
from locators.all_books_page import AllBooksPageLocators
5+
from parsers.book import BookParser
6+
from bs4 import BeautifulSoup
7+
8+
logger = logging.getLogger('scraping.all_books_page')
9+
10+
11+
class AllBooksPage:
    """
    One catalogue listing page, parsed with BeautifulSoup.

    Exposes the books found on the page and the total number of catalogue
    pages reported by the page's pager element.
    """

    def __init__(self, page):
        """Parse `page` (the raw HTML content) with the 'html.parser' backend."""
        logger.debug('Parsing page content with BeautifulSoup HTML parser.')
        self.soup = BeautifulSoup(page, 'html.parser')

    @property
    def books(self):
        """Return a BookParser for every element matching AllBooksPageLocators.BOOKS."""
        logger.debug(f'Finding all books in the page using `{AllBooksPageLocators.BOOKS}`')
        return [BookParser(e) for e in self.soup.select(AllBooksPageLocators.BOOKS)]

    @property
    def page_count(self):
        """
        Return the total number of catalogue pages as an int.

        The number is extracted from pager text of the form
        'Page <current> of <total>'.

        Raises:
            ValueError: if the pager element is missing, has no single
                text string, or its text does not match the pattern.
        """
        logger.debug('Finding all number of catalogue pages available...')
        pager = self.soup.select_one(AllBooksPageLocators.PAGER)
        # select_one returns None when nothing matches, and .string is
        # None when the element has no single text child.
        content = pager.string if pager is not None else None
        if content is None:
            raise ValueError(
                f'No pager text found using `{AllBooksPageLocators.PAGER}`.'
            )
        logger.info(f'Found number of catalogue pages available: `{content}`')
        pattern = r'Page [0-9]+ of ([0-9]+)'
        matcher = re.search(pattern, content)
        if matcher is None:
            raise ValueError(f'Unexpected pager text: `{content}`.')
        pages = int(matcher.group(1))
        logger.info(f'Extracted number of pages as integer: `{pages}`.')
        return pages

section11/projects/scraping-books/parsers/__init__.py

Whitespace-only changes.
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import re
2+
import logging
3+
4+
from locators.book_locators import BookLocators
5+
6+
logger = logging.getLogger('scraping.book_parser')
7+
8+
9+
class BookParser:
    """
    A class to take in an HTML page or content, and find properties of an item
    in it.
    """

    # Maps the word used in the star-rating CSS class to its numeric value.
    RATINGS = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5
    }

    def __init__(self, parent):
        """Store `parent`, the parsed element every property searches within."""
        # Lazy %-formatting: `parent` is a whole HTML element, so avoid
        # rendering it to text unless DEBUG logging is actually enabled.
        logger.debug('New book parser created from `%s`', parent)
        self.parent = parent

    def __repr__(self):
        return f'<Book {self.name} {self.price}, {self.rating} stars>'

    @property
    def name(self):
        """Return the book title, read from the link's `title` attribute."""
        logger.debug('Finding book name...')
        locator = BookLocators.NAME_LOCATOR
        item_name = self.parent.select_one(locator).attrs['title']
        logger.info(f'Found book name, `{item_name}`.')
        return item_name

    @property
    def link(self):
        """Return the book page URL, read from the link's `href` attribute."""
        logger.debug('Finding book page link...')
        locator = BookLocators.LINK_LOCATOR
        item_url = self.parent.select_one(locator).attrs['href']
        logger.info(f'Found book page link, `{item_url}`.')
        return item_url

    @property
    def price(self):
        """
        Return the book price as a float.

        Raises:
            ValueError: if the price text does not look like '£<n>.<n>'.
        """
        logger.debug('Finding book price...')
        locator = BookLocators.PRICE_LOCATOR
        item_price = self.parent.select_one(locator).string
        logger.debug(f'Item price element found, `{item_price}`')

        # Raw string so that `\.` is a regex escape, not an (invalid)
        # Python string escape.
        pattern = r'£([0-9]+\.[0-9]+)'
        matcher = re.search(pattern, item_price)
        if matcher is None:
            raise ValueError(f'Unexpected price text: `{item_price}`.')
        price = float(matcher.group(1))
        logger.info(f'Found book price, `{price}`.')
        return price

    @property
    def rating(self):
        """
        Return the book rating as an int from 1 to 5.

        Returns None when the element carries no class found in RATINGS.
        """
        logger.debug('Finding book rating...')
        locator = BookLocators.RATING_LOCATOR
        star_rating_element = self.parent.select_one(locator)
        classes = star_rating_element.attrs['class']
        # Skip the marker class itself; the first remaining class is
        # expected to be the rating word ('One' .. 'Five').
        rating_classes = (c for c in classes if c != 'star-rating')
        rating_class = next(rating_classes, None)

        logger.debug(f'Found rating class, `{rating_class}`.')
        logger.debug('Converting to integer for sorting.')
        rating = BookParser.RATINGS.get(rating_class)
        logger.info(f'Found book rating, `{rating}`.')
        return rating

0 commit comments

Comments
 (0)
0