updates · ThinkCode/python-scripts@53da94f · GitHub

Commit 53da94f

committed
updates
1 parent 239a0ff commit 53da94f

File tree

3 files changed: +37 -53 lines changed

08_basic_email_web_crawler.py

+20 -24

@@ -7,40 +7,36 @@
 link_re = re.compile(r'href="(.*?)"')
 
 
-def crawl(url, maxlevel):
+def crawl(url):
 
     result = set()
 
-    while maxlevel > 0:
+    req = requests.get(url)
 
-        # Get the webpage
-        req = requests.get(url)
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
+    print "\nFound {} links".format(len(links))
 
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+    # Search links for emails
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
-
-        # recurse
-        crawl(link, maxlevel)
+        # Find all emails on current page
+        result.update(email_re.findall(req.text))
 
     return result
 
-emails = crawl('http://www.website_goes_here_dot_com', 2)
+if __name__ == '__main__':
+    emails = crawl('http://www.realpython.com')
 
-print "\nScrapped e-mail addresses:"
-for email in emails:
-    print email
+    print "\nScrapped e-mail addresses:"
+    for email in emails:
+        print email
+    print "\n"

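For context, the committed script targets Python 2 (print statements, the urlparse module). A rough Python 3 sketch of the same single-page approach is shown below; the email_re pattern is an assumption, since the diff only shows link_re, and the rest mirrors the new version of the script:

import re
import requests

link_re = re.compile(r'href="(.*?)"')               # same pattern as the script
email_re = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')  # assumed; not shown in the diff


def crawl(url):
    # Fetch a single page and return the e-mail addresses found on it.
    result = set()

    req = requests.get(url)
    if req.status_code != 200:
        return result

    links = link_re.findall(req.text)
    print("\nFound {} links".format(len(links)))

    # As in the committed version, only the fetched page itself is searched;
    # the extracted links are no longer followed.
    result.update(email_re.findall(req.text))
    return result


if __name__ == '__main__':
    for email in crawl('http://www.realpython.com'):
        print(email)

This sketch drops the per-link loop: in the committed code the loop body searches req.text of the original page on every iteration anyway, and the set already deduplicates repeated addresses.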
09_basic_link_web_crawler.py

+15 -27

@@ -6,39 +6,27 @@
 link_re = re.compile(r'href="(.*?)"')
 
 
-def crawl(url, maxlevel):
+def crawl(url):
 
-    result = set()
+    req = requests.get(url)
 
-    while maxlevel > 0:
+    # Check if successful
+    if(req.status_code != 200):
+        return []
 
-        # Get the webpage
-        req = requests.get(url)
+    # Find links
+    links = link_re.findall(req.text)
 
-        # Check if successful
-        if(req.status_code != 200):
-            return []
+    print "\nFound {} links".format(len(links))
 
-        # Find and follow all the links
-        links = link_re.findall(req.text)
-        for link in links:
-            # Get an absolute URL for a link
-            link = urlparse.urljoin(url, link)
-            # add links to result set
-            result.update(link)
+    # Search links for emails
+    for link in links:
 
-        print "Crawled level: {}".format(maxlevel)
+        # Get an absolute URL for a link
+        link = urlparse.urljoin(url, link)
 
-        # new level
-        maxlevel -= 1
+        print link
 
-        # recurse
-        crawl(link, maxlevel)
 
-    return result
-
-emails = crawl('http://www.website_goes_here_dot_com', 2)
-
-print "\nScrapped links:"
-for link in links:
-    print link
+if __name__ == '__main__':
+    crawl('http://www.realpython.com')

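Similarly, a minimal Python 3 sketch of the single-page link crawler, using urllib.parse.urljoin in place of the Python 2 urlparse module; apart from that translation, it follows the new version of the script:

import re
import requests
from urllib.parse import urljoin   # Python 3 equivalent of urlparse.urljoin

link_re = re.compile(r'href="(.*?)"')   # same pattern as the script


def crawl(url):
    # Fetch a single page and print the absolute URL of every link on it.
    req = requests.get(url)
    if req.status_code != 200:
        return

    links = link_re.findall(req.text)
    print("\nFound {} links".format(len(links)))

    for link in links:
        # Resolve relative hrefs against the page URL
        print(urljoin(url, link))


if __name__ == '__main__':
    crawl('http://www.realpython.com')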
readme.md

+2 -2

@@ -7,6 +7,6 @@
 1. **05_load_json_without_dupes.py**: load json, convert to dict, raise error if there is a duplicate key
 1. **06_execution_time.py**: class used for timing execution of code
 1. **07_benchmark_permissions_loading_django.py**: benchmark loading of permissions in Django
-1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website recursively
-1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website recursively
+1. **08_basic_email_web_crawler.py**: web crawler for grabbing emails from a website
+1. **09_basic_link_web_crawler.py**: web crawler for grabbing links from a website
 1. **10_find_files_recursively.py**: recursively grab files from a directory

0 commit comments