8000 Web scraping, csv, and command-line tools. · pythonpeixun/practice-python@f237ab1 · GitHub
[go: up one dir, main page]

Skip to content

Commit f237ab1

Browse files
committed
Web scraping, csv, and command-line tools.
1 parent a0febcf commit f237ab1

File tree

4 files changed

+85
-0
lines changed

4 files changed

+85
-0
lines changed

data-science/books.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from bs4 import BeautifulSoup
2+
import csv
3+
import pprint
4+
import re
5+
import requests
6+
import time
7+
8+
9+
def get_book_data(element):
    """Given a BeautifulSoup Tag representing one book listing,
    extract the book's details.

    Returns a dict with keys:
        'title'   -- the book title string
        'authors' -- list of author-name strings
    """
    title = element.find('div', 'thumbheader').a.text
    by_author = element.find('div', 'AuthorName').text
    # Strip only a LEADING "by " prefix, case-insensitively.  The
    # previous unanchored sub removed "by " anywhere in the string,
    # mangling author names such as "Abby Lee" -> "AbLee".
    names = re.sub(r'^\s*by\s+', '', by_author, flags=re.IGNORECASE)
    authors = [name.strip() for name in names.split(',')]

    return {
        'title': title,
        'authors': authors,
    }
25+
26+
27+
def main():
    """Scrape the O'Reilly data-books listing, collect title/author
    records from every page, write them to books.txt as CSV, and
    pretty-print the collected records to stdout."""
    NUM_PAGES = 31  # page count of the listing at the time of writing
    books = []

    base_url = 'http://shop.oreilly.com/category/browse-subjects/data.do?sortby=publicationDate&page='

    for page_num in range(1, NUM_PAGES + 1):
        print("souping page", page_num, ",", len(books), " found so far")
        html = requests.get(base_url + str(page_num)).text
        soup = BeautifulSoup(html, 'html5lib')
        # Each <td class="thumbtext"> holds one book listing.
        books.extend([get_book_data(group) for group in soup('td', 'thumbtext')])

        # Be polite to the server between requests, but don't waste
        # 30 seconds after the final page has been fetched.
        if page_num < NUM_PAGES:
            time.sleep(30)

    # newline='' is required when handing a file to the csv module:
    # without it the writer produces \r\r\n line endings on Windows,
    # which show up as blank rows in the output.
    with open('books.txt', 'w', newline='') as file:
        writer = csv.writer(file)  # ',' is already the default delimiter
        writer.writerow(["Title", "Authors"])
        for book in books:
            writer.writerow([book['title'], ', '.join(book['authors'])])

    pprint.pprint(books)
48+
49+
50+
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()

data-science/egrep.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import re
2+
import sys
3+
4+
# Minimal egrep clone: print every stdin line that matches the regex
# given as the first command-line argument.
if len(sys.argv) < 2:
    # Previously a missing argument raised a bare IndexError; fail
    # with a usage message instead.
    sys.stderr.write("usage: egrep.py regex\n")
    sys.exit(1)

regex = sys.argv[1]

for line in sys.stdin:
    # re.search matches anywhere in the line (egrep semantics),
    # not just at the start.
    if re.search(regex, line):
        sys.stdout.write(line)

data-science/line-count.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
import sys
2+
3+
# Count the lines arriving on stdin and print the total (a tiny
# analogue of `wc -l` for use at the end of a pipeline).
count = sum(1 for _ in sys.stdin)

print(count)

data-science/most_common_words.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from collections import Counter
2+
import sys
3+
4+
# Read words from stdin and print the N most frequent as
# "count<TAB>word" lines, where N is the first command-line argument.
try:
    num_words = int(sys.argv[1])
except (IndexError, ValueError):
    # IndexError: no argument was supplied; ValueError: the argument
    # is not an integer.  The original caught only ValueError, so a
    # missing argument crashed with a traceback instead of usage help.
    print("usage: most_common_words.py num_words")
    sys.exit(1)

# Case-fold so "The" and "the" count as the same word.  str.split()
# with no separator never yields empty strings, so no extra filter
# is needed.
counter = Counter(word.lower()
                  for line in sys.stdin
                  for word in line.strip().split())

for word, count in counter.most_common(num_words):
    sys.stdout.write(f"{count}\t{word}\n")

0 commit comments

Comments
 (0)
0