Saigelaw
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎section11/projects/scraping-books/app.py
Lines changed: 32 additions & 0 deletions b/‎section11/projects/scraping-books/app.py
Lines changed: 32 additions & 0 deletions
diff --git a/‎section11/projects/scraping-books/locators/__init__.py b/‎section11/projects/scraping-books/locators/__init__.py
diff --git a/‎section11/projects/scraping-books/locators/all_books_page.py
Lines changed: 3 additions & 0 deletions b/‎section11/projects/scraping-books/locators/all_books_page.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎section11/projects/scraping-books/locators/book_locators.py
Lines changed: 11 additions & 0 deletions b/‎section11/projects/scraping-books/locators/book_locators.py
Lines changed: 11 additions & 0 deletions
diff --git a/‎section11/projects/scraping-books/menu.py
Lines changed: 59 additions & 0 deletions b/‎section11/projects/scraping-books/menu.py
Lines changed: 59 additions & 0 deletions
diff --git a/‎section11/projects/scraping-books/pages/__init__.py b/‎section11/projects/scraping-books/pages/__init__.py
diff --git a/‎section11/projects/scraping-books/pages/all_books_page.py
Lines changed: 30 additions & 0 deletions b/‎section11/projects/scraping-books/pages/all_books_page.py
Lines changed: 30 additions & 0 deletions
diff --git a/‎section11/projects/scraping-books/parsers/__init__.py b/‎section11/projects/scraping-books/parsers/__init__.py
diff --git a/‎section11/projects/scraping-books/parsers/book.py
Lines changed: 72 additions & 0 deletions b/‎section11/projects/scraping-books/parsers/book.py
Lines changed: 72 additions & 0 deletions
diff --git a/‎section11/projects/scraping-quotes/app.py
Lines changed: 9 additions & 0 deletions b/‎section11/projects/scraping-quotes/app.py
Lines changed: 9 additions & 0 deletions
diff --git a/‎section11/projects/scraping-quotes/locators/__init__.py b/‎section11/projects/scraping-quotes/locators/__init__.py
diff --git a/‎section11/projects/scraping-quotes/locators/quote_locators.py
Lines changed: 4 additions & 0 deletions b/‎section11/projects/scraping-quotes/locators/quote_locators.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎section11/projects/scraping-quotes/locators/quotes_page_locators.py
Lines changed: 2 additions & 0 deletions b/‎section11/projects/scraping-quotes/locators/quotes_page_locators.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎section11/projects/scraping-quotes/pages/__init__.py b/‎section11/projects/scraping-quotes/pages/__init__.py
diff --git a/‎section11/projects/scraping-quotes/pages/quotes_page.py
Lines changed: 13 additions & 0 deletions b/‎section11/projects/scraping-quotes/pages/quotes_page.py
Lines changed: 13 additions & 0 deletions
diff --git a/‎section11/projects/scraping-quotes/parsers/__init__.py b/‎section11/projects/scraping-quotes/parsers/__init__.py
diff --git a/‎section11/projects/scraping-quotes/parsers/quote.py
Lines changed: 24 additions & 0 deletions b/‎section11/projects/scraping-quotes/parsers/quote.py
Lines changed: 24 additions & 0 deletions
diff --git a/‎section11/projects/understanding-html/class_html_parsing.py
Lines changed: 74 additions & 0 deletions b/‎section11/projects/understanding-html/class_html_parsing.py
Lines changed: 74 additions & 0 deletions
@@ -1,3 +1,4 @@
+logs.txt
 data.db
 *.key
 *.indd
 
@@ -0,0 +1,32 @@
+import requests
+import logging
+
+from pages.all_books_page import AllBooksPage
+
+# Send every record at INFO and above to logs.txt; the format records the
+# emitting file and line so a scraping problem can be traced afterwards.
+logging.basicConfig(format='%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
+                    datefmt='%d-%m-%Y %H:%M:%S',
+                    level=logging.INFO,
+                    filename='logs.txt')
+logger = logging.getLogger('scraping')
+
+BASE_URL = 'http://books.toscrape.com'
+# Without a timeout `requests` waits indefinitely on an unresponsive server.
+REQUEST_TIMEOUT = 10  # seconds
+
+
+def _fetch(session, url):
+    """Return the raw body of `url`; raise `requests.HTTPError` on a 4xx/5xx reply."""
+    logger.info(f'Requesting {url}')
+    response = session.get(url, timeout=REQUEST_TIMEOUT)
+    # Fail here, with the status code, rather than later while parsing an
+    # error page that has no pager or books in it.
+    response.raise_for_status()
+    return response.content
+
+
+print('Loading books list...')
+logger.info('Loading books list.')
+
+_books = []
+
+# One session lets the HTTP connection be reused for every catalogue page.
+with requests.Session() as session:
+    page_content = _fetch(session, BASE_URL)
+
+    logger.debug('Creating AllBooksPage from page content.')
+    page = AllBooksPage(page_content)
+
+    # Read the count once: `page` is rebound inside the loop, and each access
+    # to the property re-parses the pager.
+    page_count = page.page_count
+    logger.info(f'Going through {page_count} pages of books...')
+    for page_num in range(page_count):
+        # Catalogue pages are numbered from 1, the loop index from 0.
+        url = f'{BASE_URL}/catalogue/page-{page_num+1}.html'
+        page_content = _fetch(session, url)
+        logger.debug('Creating AllBooksPage from page content.')
+        page = AllBooksPage(page_content)
+        _books.extend(page.books)
+
+# Public name imported by menu.py.
+books = _books
@@ -0,0 +1,3 @@
+class AllBooksPageLocators:
+    """CSS selectors that are applied to a whole catalogue listing page."""
+    # Selects one <li> grid cell per book on the page.
+    BOOKS = 'div.page_inner section li.col-xs-6.col-sm-4.col-md-3.col-lg-3'
+    # Selects the `li.current` item inside the page's `ul.pager`.
+    PAGER = 'div.page_inner section ul.pager li.current'
@@ -0,0 +1,11 @@
+class BookLocators:
+    """
+    Locators for an item in the HTML page.
+
+    This allows us to easily see what our code will be looking at
+    as well as change it quickly if we notice it is now different.
+
+    Each value is a CSS selector string.
+    """
+    # The name and the link use the same selector: both live on the <a>
+    # inside the product's <h3>.
+    NAME_LOCATOR = 'article.product_pod h3 a'
+    LINK_LOCATOR = 'article.product_pod h3 a'
+    # <p> element with the `price_color` class.
+    PRICE_LOCATOR = 'article.product_pod p.price_color'
+    # <p> element with the `star-rating` class.
+    RATING_LOCATOR = 'article.product_pod p.star-rating'
@@ -0,0 +1,59 @@
+import logging
+
+from app import books
+
+logger = logging.getLogger('scraping.menu')
+
+
+# Prompt text passed to input() by menu() each time it asks for a command.
+USER_CHOICE = '''Enter one of the following
+
+- 'b' to look at 5-star books
+- 'c' to look at the cheapest books
+- 'n' to just get the next available book on the page
+- 'q' to exit
+
+Enter your choice: '''
+
+
+def print_best_books():
+    """Print the five highest-rated books, best first."""
+    logger.debug('Finding best books by rating...')
+    # Negating the rating makes the ascending sort put high ratings first.
+    ranked = sorted(books, key=lambda book: -book.rating)
+    for entry in ranked[:5]:
+        print(entry)
+
+
+def print_cheapest_books():
+    """Print the five lowest-priced books, cheapest first."""
+    logger.debug('Finding best books by price...')
+    by_price = sorted(books, key=lambda book: book.price)
+    for entry in by_price[:5]:
+        print(entry)
+
+
+# One shared generator, so repeated 'n' choices walk through the list once
+# instead of restarting from the first book.
+books_generator = (x for x in books)
+
+
+def get_next_book():
+    """Print the next book not yet shown, or a notice once all have been shown."""
+    logger.debug('Getting next book from generator of all books...')
+    # The default stops next() from raising StopIteration, which would
+    # otherwise escape menu() and end the program after the last book.
+    book = next(books_generator, None)
+    if book is None:
+        print('No more books available.')
+    else:
+        print(book)
+
+
+# Dispatch table: maps each menu key to the function that handles it.
+user_choices = {
+    'b': print_best_books,
+    'c': print_cheapest_books,
+    'n': get_next_book
+}
+
+
+def menu():
+    """Prompt repeatedly and run the chosen command until the user enters 'q'."""
+    user_input = input(USER_CHOICE)
+    while user_input != 'q':
+        logger.debug('User did not choose to exit program.')
+        # Look the command up in the dispatch table itself, so a newly added
+        # entry is accepted without also editing a separate list of keys.
+        command = user_choices.get(user_input)
+        if command is None:
+            print('Please choose a valid command.')
+        else:
+            command()
+        user_input = input(USER_CHOICE)
+    logger.debug('Terminating program...')
+
+
+menu()
@@ -0,0 +1,30 @@
+import re
+import logging
+
+from locators.all_books_page import AllBooksPageLocators
+from parsers.book import BookParser
+from bs4 import BeautifulSoup
+
+logger = logging.getLogger('scraping.all_books_page')
+
+
+class AllBooksPage:
+    """Wraps one catalogue page and exposes its books and the total page count."""
+
+    def __init__(self, page):
+        logger.debug('Parsing page content with BeautifulSoup HTML parser.')
+        self.soup = BeautifulSoup(page, 'html.parser')
+
+    @property
+    def books(self):
+        """Return a `BookParser` for every element matched by the BOOKS locator."""
+        logger.debug(f'Finding all books in the page using `{AllBooksPageLocators.BOOKS}`')
+        book_elements = self.soup.select(AllBooksPageLocators.BOOKS)
+        return list(map(BookParser, book_elements))
+
+    @property
+    def page_count(self):
+        """Return the total number of pages read from the pager text, as an int."""
+        logger.debug('Finding all number of catalogue pages available...')
+        pager = self.soup.select_one(AllBooksPageLocators.PAGER)
+        content = pager.string
+        logger.info(f'Found number of catalogue pages available: `{content}`')
+        # The only capture group is the number that follows "of".
+        total = re.search('Page [0-9]+ of ([0-9]+)', content).group(1)
+        pages = int(total)
+        logger.info(f'Extracted number of pages as integer: `{pages}`.')
+        return pages
@@ -0,0 +1,72 @@
+import re
+import logging
+
+from locators.book_locators import BookLocators
+
+logger = logging.getLogger('scraping.book_parser')
+
+
+class BookParser:
+    """
+    A class to take in an HTML page or content, and find properties of an item
+    in it.
+
+    `parent` is the element holding one book; every property queries it with
+    `select_one` each time it is read, so nothing is cached.
+    """
+
+    # Maps the word used as a CSS class on the rating element to a number.
+    RATINGS = {
+        'One': 1,
+        'Two': 2,
+        'Three': 3,
+        'Four': 4,
+        'Five': 5
+    }
+
+    # Raw string: `\.` inside a normal literal is an invalid escape sequence,
+    # which newer Python versions warn about when the module is compiled.
+    _PRICE_PATTERN = re.compile(r'£([0-9]+\.[0-9]+)')
+
+    def __init__(self, parent):
+        # Lazy %-style argument: the element is only turned into text when
+        # DEBUG logging is actually enabled, not once per book regardless.
+        logger.debug('New book parser created from `%s`', parent)
+        self.parent = parent
+
+    def __repr__(self):
+        return f'<Book {self.name} {self.price}, {self.rating} stars>'
+
+    @property
+    def name(self):
+        """Return the `title` attribute of the element matched by NAME_LOCATOR."""
+        logger.debug('Finding book name...')
+        locator = BookLocators.NAME_LOCATOR
+        item_name = self.parent.select_one(locator).attrs['title']
+        logger.info(f'Found book name, `{item_name}`.')
+        return item_name
+
+    @property
+    def link(self):
+        """Return the `href` attribute of the element matched by LINK_LOCATOR."""
+        logger.debug('Finding book page link...')
+        locator = BookLocators.LINK_LOCATOR
+        item_url = self.parent.select_one(locator).attrs['href']
+        logger.info(f'Found book page link, `{item_url}`.')
+        return item_url
+
+    @property
+    def price(self):
+        """Return the number that follows `£` in the price element, as a float."""
+        logger.debug('Finding book price...')
+        locator = BookLocators.PRICE_LOCATOR
+        item_price = self.parent.select_one(locator).string
+        logger.debug(f'Item price element found, `{item_price}`')
+
+        matcher = self._PRICE_PATTERN.search(item_price)
+        price = float(matcher.group(1))
+        logger.info(f'Found book price, `{price}`.')
+        return price
+
+    @property
+    def rating(self):
+        """
+        Return the numeric rating looked up in RATINGS, or None when the
+        rating class is not one of its keys.
+        """
+        logger.debug('Finding book rating...')
+        locator = BookLocators.RATING_LOCATOR
+        star_rating_element = self.parent.select_one(locator)
+        classes = star_rating_element.attrs['class']
+        # The first class that is not `star-rating` itself is the rating word.
+        rating_classes = filter(lambda x: x != 'star-rating', classes)
+        rating_class = next(rating_classes)
+
+        logger.debug(f'Found rating class, `{rating_class}`.')
+        logger.debug('Converting to integer for sorting.')
+        rating = BookParser.RATINGS.get(rating_class)
+        logger.info(f'Found book rating, `{rating}`.')
+        return rating
@@ -0,0 +1,9 @@
+import requests
+
+from pages.quotes_page import QuotesPage
+
+# A timeout stops the script from hanging forever on an unresponsive server.
+response = requests.get('http://quotes.toscrape.com', timeout=10)
+# Stop on a 4xx/5xx reply instead of parsing an error page.
+response.raise_for_status()
+page_content = response.content
+page = QuotesPage(page_content)
+
+for quote in page.quotes:
+    print(quote)
@@ -0,0 +1,4 @@
+class QuoteLocators:
+    """CSS selectors for the parts of a single quote element."""
+    # <span> with the `text` class.
+    CONTENT_LOCATOR = 'span.text'
+    # <small> with the `author` class.
+    AUTHOR_LOCATOR = 'small.author'
+    # Every `a.tag` link inside the `div.tags` container.
+    TAGS_LOCATOR = 'div.tags a.tag'
@@ -0,0 +1,2 @@
+class QuotesPageLocators:
+    """CSS selectors that are applied to a whole quotes page."""
+    # Selects each <div> with the `quote` class.
+    QUOTE = 'div.quote'
@@ -0,0 +1,13 @@
+from bs4 import BeautifulSoup
+
+from locators.quotes_page_locators import QuotesPageLocators
+from parsers.quote import QuoteParser
+
+
+class QuotesPage:
+    """Parses the HTML of a quotes page and exposes the quotes found in it."""
+
+    def __init__(self, page):
+        self.soup = BeautifulSoup(page, 'html.parser')
+
+    @property
+    def quotes(self):
+        """Return a `QuoteParser` for every element matched by the QUOTE locator."""
+        quote_elements = self.soup.select(QuotesPageLocators.QUOTE)
+        return list(map(QuoteParser, quote_elements))
@@ -0,0 +1,24 @@
+from locators.quote_locators import QuoteLocators
+
+
+class QuoteParser:
+    """Reads the text, author and tag elements out of one quote element."""
+
+    def __init__(self, parent):
+        self.parent = parent
+
+    def __repr__(self):
+        return f'<Quote {self.content}, by {self.author}>'
+
+    @property
+    def content(self):
+        """Return the `.string` of the element matched by CONTENT_LOCATOR."""
+        return self.parent.select_one(QuoteLocators.CONTENT_LOCATOR).string
+
+    @property
+    def author(self):
+        """Return the `.string` of the element matched by AUTHOR_LOCATOR."""
+        return self.parent.select_one(QuoteLocators.AUTHOR_LOCATOR).string
+
+    @property
+    def tags(self):
+        """Return what `select` yields for TAGS_LOCATOR: the elements, not their text."""
+        return self.parent.select(QuoteLocators.TAGS_LOCATOR)
@@ -0,0 +1,74 @@
+import re
+
+from bs4 import BeautifulSoup
+
+
+ITEM_HTML = '''<html><head></head><body>
+<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
+    <article class="product_pod">
+            <div class="image_container">
+                    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" alt="A Light in the Attic" class="thumbnail"></a>
+            </div>
+                <p class="star-rating Three">
+                    <i class="icon-star"></i>
+                    <i class="icon-star"></i>
+                    <i class="icon-star"></i>
+                    <i class="icon-star"></i>
+                    <i class="icon-star"></i>
+                </p>
+            <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
+            <div class="product_price">
+        <p class="price_color">£51.77</p>
+<p class="instock availability">
+    <i class="icon-ok"></i>
+
+        In stock
+
+</p>
+    <form>
+        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
+    </form>
+            </div>
+    </article>
+</li>
+
+</body></html>
+'''
+
+
+class ParsedItem:
+    """
+    A class to take in an HTML page or content, and find properties of an item
+    in it.
+
+    The page is parsed once in `__init__`; each method then queries the
+    parsed tree with a CSS selector.
+    """
+
+    # Raw string: `\.` inside a normal literal is an invalid escape sequence,
+    # which newer Python versions warn about when the module is compiled.
+    _PRICE_PATTERN = re.compile(r'£([0-9]+\.[0-9]+)')
+
+    def __init__(self, page):
+        self.soup = BeautifulSoup(page, 'html.parser')
+
+    def name(self):
+        """Return the `title` attribute of the link inside the product's <h3>."""
+        locator = 'article.product_pod h3 a'
+        item_name = self.soup.select_one(locator).attrs['title']
+        return item_name
+
+    def link(self):
+        """Return the `href` attribute of the link inside the product's <h3>."""
+        locator = 'article.product_pod h3 a'
+        item_url = self.soup.select_one(locator).attrs['href']
+        return item_url
+
+    def price(self):
+        """Return the number that follows `£` in the price element, as a float."""
+        locator = 'article.product_pod p.price_color'
+        item_price = self.soup.select_one(locator).string
+
+        matcher = self._PRICE_PATTERN.search(item_price)
+        return float(matcher.group(1))
+
+    def rating(self):
+        """Return the rating word: the first class that is not `star-rating`."""
+        locator = 'article.product_pod p.star-rating'
+        star_rating_element = self.soup.select_one(locator)
+        classes = star_rating_element.attrs['class']
+        rating_classes = filter(lambda x: x != 'star-rating', classes)
+        return next(rating_classes)
+
+
+# Demo: parse the sample markup above and print the price found in it.
+item = ParsedItem(ITEM_HTML)
+print(item.price())
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+logs.txt`
`1`	`2`	`data.db`
`2`	`3`	`*.key`
`3`	`4`	`*.indd`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+class AllBooksPageLocators:`
	`2`	`+ BOOKS = 'div.page_inner section li.col-xs-6.col-sm-4.col-md-3.col-lg-3'`
	`3`	`+ PAGER = 'div.page_inner section ul.pager li.current'`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+class QuotesPageLocators:`
	`2`	`+ QUOTE = 'div.quote'`