Added section 11, web scraping project. · Saigelaw/complete-python-course@3da6b6e · GitHub
[go: up one dir, main page]

Skip to content

Commit 3da6b6e

Browse files
committed
Added section 11, web scraping project.
1 parent a628745 commit 3da6b6e

23 files changed

+567
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
logs.txt
12
data.db
23
*.key
34
*.indd
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import requests
2+
import logging
3+
4+
from pages.all_books_page import AllBooksPage
5+
6+
# Send every record at INFO level or above to logs.txt, stamped with the
# time, the level and the source location of the logging call.
logging.basicConfig(format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='%d-%m-%Y %H:%M:%S',
                    level=logging.INFO,
                    filename='logs.txt')
logger = logging.getLogger('scraping')

# Upper bound, in seconds, on how long one HTTP request may stall.
# `requests.get` has no default timeout and could otherwise wait indefinitely.
REQUEST_TIMEOUT = 30


def _download(url):
    """
    Fetch `url` and return the raw response body.

    Raises `requests.HTTPError` for a 4xx/5xx answer, so an error page is
    never handed to the HTML parser as if it were a catalogue page.
    """
    logger.info(f'Requesting {url}')
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content


print('Loading books list...')
logger.info('Loading books list.')

# The landing page is only used to find out how many catalogue pages exist.
page_content = _download('http://books.toscrape.com')

logger.debug('Creating AllBooksPage from page content.')
page = AllBooksPage(page_content)

_books = []

logger.info(f'Going through {page.page_count} pages of books...')
for page_num in range(page.page_count):
    # `range` counts from 0 while the catalogue URLs are numbered from 1.
    url = f'http://books.toscrape.com/catalogue/page-{page_num+1}.html'
    page_content = _download(url)
    logger.debug('Creating AllBooksPage from page content.')
    page = AllBooksPage(page_content)
    _books.extend(page.books)

# Every book parser collected above; other code in this project reads it
# via `from app import books`.
books = _books

section11/projects/scraping-books/locators/__init__.py

Whitespace-only changes.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
class AllBooksPageLocators:
    """CSS selectors used to pick apart a book listing page."""

    # Selects the book entries; each match is wrapped in a BookParser.
    BOOKS = 'div.page_inner section li.col-xs-6.col-sm-4.col-md-3.col-lg-3'
    # Selects the pager entry whose text is read as 'Page <n> of <total>'.
    PAGER = 'div.page_inner section ul.pager li.current'
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
class BookLocators:
    """
    Locators for an item in the HTML page.

    This allows us to easily see what our code will be looking at
    as well as change it quickly if we notice it is now different.
    """

    # Name and link share one selector: the parser reads the `title`
    # attribute for the name and the `href` attribute for the link.
    NAME_LOCATOR = 'article.product_pod h3 a'
    LINK_LOCATOR = 'article.product_pod h3 a'
    PRICE_LOCATOR = 'article.product_pod p.price_color'
    RATING_LOCATOR = 'article.product_pod p.star-rating'
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import logging
2+
3+
from app import books
4+
5+
logger = logging.getLogger('scraping.menu')


# Prompt shown by `input()` each time the menu asks for a command.
USER_CHOICE = '''Enter one of the following

- 'b' to look at 5-star books
- 'c' to look at the cheapest books
- 'n' to just get the next available book on the page
- 'q' to exit

Enter your choice: '''
16+
17+
18+
def print_best_books():
    """Print the five books with the highest rating, best first."""
    logger.debug('Finding best books by rating...')
    # Sorting on the negated rating puts the highest ratings first while
    # keeping equally rated books in their original order.
    best_books = sorted(books, key=lambda x: x.rating * -1)[:5]
    for book in best_books:
        print(book)
23+
24+
25+
def print_cheapest_books():
    """Print the five books with the lowest price, cheapest first."""
    logger.debug('Finding best books by price...')
    cheapest_books = sorted(books, key=lambda x: x.price)[:5]
    for book in cheapest_books:
        print(book)
30+
31+
32+
# One shared iterator over `books`, so each call to `get_next_book`
# continues where the previous call stopped.
books_generator = (x for x in books)


def get_next_book():
    """
    Print the next book that this function has not shown yet.

    Once every book has been printed a short notice is shown instead;
    previously the exhausted generator raised `StopIteration`, which
    escaped and ended the menu loop with a traceback.
    """
    logger.debug('Getting next book from generator of all books...')
    next_book = next(books_generator, None)
    if next_book is None:
        print('There are no more books to show.')
        return
    print(next_book)
38+
39+
40+
# Maps each menu key to the function that carries out that command.
user_choices = {
    'b': print_best_books,
    'c': print_cheapest_books,
    'n': get_next_book,
}


def menu():
    """
    Prompt for commands until the user enters 'q'.

    A command is valid when it is a key of `user_choices`; looking it up
    there (rather than in a separate hard-coded tuple) keeps the accepted
    keys and the dispatch table from drifting apart.
    """
    user_input = input(USER_CHOICE)
    while user_input != 'q':
        logger.debug('User did not choose to exit program.')
        handler = user_choices.get(user_input)
        if handler is not None:
            handler()
        else:
            print('Please choose a valid command.')
        user_input = input(USER_CHOICE)
    logger.debug('Terminating program...')


menu()

section11/projects/scraping-books/pages/__init__.py

Whitespace-only changes.
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import re
2+
import logging
3+
4+
from locators.all_books_page import AllBooksPageLocators
5+
from parsers.book import BookParser
6+
from bs4 import BeautifulSoup
7+
8+
logger = logging.getLogger('scraping.all_books_page')


class AllBooksPage:
    """
    Parsed view of one book listing page.

    Exposes the books found on the page and the total number of pages
    announced by the page's pager.
    """

    def __init__(self, page):
        """Parse `page` (the HTML content) with BeautifulSoup's 'html.parser'."""
        logger.debug('Parsing page content with BeautifulSoup HTML parser.')
        self.soup = BeautifulSoup(page, 'html.parser')

    @property
    def books(self):
        """Return a `BookParser` for every element matching the BOOKS locator."""
        logger.debug(f'Finding all books in the page using `{AllBooksPageLocators.BOOKS}`')
        return [BookParser(e) for e in self.soup.select(AllBooksPageLocators.BOOKS)]

    @property
    def page_count(self):
        """
        Return the total number of pages read from the pager text.

        Raises:
            ValueError: if the pager element is missing, or its text does
                not look like 'Page <n> of <total>'.
        """
        logger.debug('Finding all number of catalogue pages available...')
        pager = self.soup.select_one(AllBooksPageLocators.PAGER)
        if pager is None:
            raise ValueError(
                f'No element matching `{AllBooksPageLocators.PAGER}` was found in the page.'
            )
        content = pager.string
        logger.info(f'Found number of catalogue pages available: `{content}`')
        pattern = 'Page [0-9]+ of ([0-9]+)'
        # `.string` is None when the element holds more than one child, so
        # fall back to an empty string and report the mismatch below.
        matcher = re.search(pattern, content or '')
        if matcher is None:
            raise ValueError(f'Could not read a page count from `{content}`.')
        pages = int(matcher.group(1))
        logger.info(f'Extracted number of pages as integer: `{pages}`.')
        return pages

section11/projects/scraping-books/parsers/__init__.py

Whitespace-only changes.
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import re
2+
import logging
3+
4+
from locators.book_locators import BookLocators
5+
6+
logger = logging.getLogger('scraping.book_parser')


class BookParser:
    """
    A class to take in an HTML page or content, and find properties of an item
    in it.
    """

    # Maps the rating word found among the element's CSS classes to a number.
    RATINGS = {
        'One': 1,
        'Two': 2,
        'Three': 3,
        'Four': 4,
        'Five': 5
    }

    def __init__(self, parent):
        """Remember `parent`, the parsed element that holds one book."""
        # Lazy %-style arguments: the element is only rendered to text when
        # a DEBUG record is actually emitted.
        logger.debug('New book parser created from `%s`', parent)
        self.parent = parent

    def __repr__(self):
        return f'<Book {self.name} {self.price}, {self.rating} stars>'

    @property
    def name(self):
        """Return the book name, taken from the `title` attribute of its link."""
        logger.debug('Finding book name...')
        locator = BookLocators.NAME_LOCATOR
        item_name = self.parent.select_one(locator).attrs['title']
        logger.info(f'Found book name, `{item_name}`.')
        return item_name

    @property
    def link(self):
        """Return the `href` of the book's link, exactly as written in the page."""
        logger.debug('Finding book page link...')
        locator = BookLocators.LINK_LOCATOR
        item_url = self.parent.select_one(locator).attrs['href']
        logger.info(f'Found book page link, `{item_url}`.')
        return item_url

    @property
    def price(self):
        """
        Return the book price as a float.

        Raises:
            ValueError: if the price text does not contain '£<digits>.<digits>'.
        """
        logger.debug('Finding book price...')
        locator = BookLocators.PRICE_LOCATOR
        item_price = self.parent.select_one(locator).string
        logger.debug(f'Item price element found, `{item_price}`')

        # Raw string so that `\.` reaches the regex engine untouched instead
        # of being an invalid string escape.
        pattern = r'£([0-9]+\.[0-9]+)'
        matcher = re.search(pattern, item_price or '')
        if matcher is None:
            raise ValueError(f'Could not read a price from `{item_price}`.')
        price = float(matcher.group(1))
        logger.info(f'Found book price, `{price}`.')
        return price

    @property
    def rating(self):
        """
        Return the star rating as an integer.

        The value is looked up in `RATINGS`; a class name that is not a key
        there yields `None`.
        """
        logger.debug('Finding book rating...')
        locator = BookLocators.RATING_LOCATOR
        star_rating_element = self.parent.select_one(locator)
        classes = star_rating_element.attrs['class']
        # The first class other than 'star-rating' is taken as the rating word.
        rating_classes = filter(lambda x: x != 'star-rating', classes)
        rating_class = next(rating_classes)

        logger.debug(f'Found rating class, `{rating_class}`.')
        logger.debug('Converting to integer for sorting.')
        rating = BookParser.RATINGS.get(rating_class)
        logger.info(f'Found book rating, `{rating}`.')
        return rating
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import requests
2+
3+
from pages.quotes_page import QuotesPage
4+
5+
# Download the quotes page and wrap the raw response body in a page object.
page_content = requests.get('http://quotes.toscrape.com').content
page = QuotesPage(page_content)

# Each quote prints through its own `__repr__`.
for quote in page.quotes:
    print(quote)

section11/projects/scraping-quotes/locators/__init__.py

Whitespace-only changes.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
class QuoteLocators:
    """CSS selectors for the parts of a single quote element."""

    CONTENT_LOCATOR = 'span.text'
    AUTHOR_LOCATOR = 'small.author'
    TAGS_LOCATOR = 'div.tags a.tag'
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class QuotesPageLocators:
    """CSS selectors used on a whole quotes page."""

    # Selects each quote element on the page.
    QUOTE = 'div.quote'

section11/projects/scraping-quotes/pages/__init__.py

Whitespace-only changes.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from bs4 import BeautifulSoup
2+
3+
from locators.quotes_page_locators import QuotesPageLocators
4+
from parsers.quote import QuoteParser
5+
6+
7+
class QuotesPage:
    """Parsed view of a quotes page that exposes the quotes it contains."""

    def __init__(self, page):
        """Parse `page` (the HTML content) with BeautifulSoup's 'html.parser'."""
        self.soup = BeautifulSoup(page, 'html.parser')

    @property
    def quotes(self):
        """Return a `QuoteParser` for every element matching the QUOTE locator."""
        return [QuoteParser(e) for e in self.soup.select(QuotesPageLocators.QUOTE)]

section11/projects/scraping-quotes/parsers/__init__.py

Whitespace-only changes.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from locators.quote_locators import QuoteLocators
2+
3+
4+
class QuoteParser:
    """Reads the content, author and tags out of one parsed quote element."""

    def __init__(self, parent):
        """Remember `parent`, the parsed element that holds one quote."""
        self.parent = parent

    def __repr__(self):
        return f'<Quote {self.content}, by {self.author}>'

    @property
    def content(self):
        """Return the `.string` of the first element matching CONTENT_LOCATOR."""
        locator = QuoteLocators.CONTENT_LOCATOR
        return self.parent.select_one(locator).string

    @property
    def author(self):
        """Return the `.string` of the first element matching AUTHOR_LOCATOR."""
        locator = QuoteLocators.AUTHOR_LOCATOR
        return self.parent.select_one(locator).string

    @property
    def tags(self):
        """Return the matched tag elements themselves (not their text)."""
        locator = QuoteLocators.TAGS_LOCATOR
        return self.parent.select(locator)
Lines changed: 74 additions & 0 deletions
</tr>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import re
2+
3+
from bs4 import BeautifulSoup
4+
5+
6+
# Sample markup for a single book entry, used as input for ParsedItem below.
ITEM_HTML = '''<html><head></head><body>
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" alt="A Light in the Attic" class="thumbnail"></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>

In stock

</p>
<form>
<button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
</form>
</div>
</article>
</li>

</body></html>
'''
37+
38+
39+
class ParsedItem:
    """
    A class to take in an HTML page or content, and find properties of an item
    in it.
    """

    def __init__(self, page):
        """Parse `page` (the HTML content) with BeautifulSoup's 'html.parser'."""
        self.soup = BeautifulSoup(page, 'html.parser')

    def name(self):
        """Return the item name, taken from the `title` attribute of its link."""
        locator = 'article.product_pod h3 a'
        item_name = self.soup.select_one(locator).attrs['title']
        return item_name

    def link(self):
        """Return the `href` of the item's link, exactly as written in the page."""
        locator = 'article.product_pod h3 a'
        item_url = self.soup.select_one(locator).attrs['href']
        return item_url

    def price(self):
        """
        Return the item price as a float.

        Raises:
            ValueError: if the price text does not contain '£<digits>.<digits>'.
        """
        locator = 'article.product_pod p.price_color'
        item_price = self.soup.select_one(locator).string

        # Raw string so that `\.` reaches the regex engine untouched instead
        # of being an invalid string escape.
        pattern = r'£([0-9]+\.[0-9]+)'
        matcher = re.search(pattern, item_price or '')
        if matcher is None:
            raise ValueError(f'Could not read a price from `{item_price}`.')
        return float(matcher.group(1))

    def rating(self):
        """Return the rating word: the first CSS class other than 'star-rating'."""
        locator = 'article.product_pod p.star-rating'
        star_rating_element = self.soup.select_one(locator)
        classes = star_rating_element.attrs['class']
        rating_classes = filter(lambda x: x != 'star-rating', classes)
        return next(rating_classes)


# Quick manual check: parse the sample markup and show its price.
item = ParsedItem(ITEM_HTML)
print(item.price())

0 commit comments

Comments
 (0)
0