added link scraper · samucc/python-scripts@776ed9c · GitHub

Commit 776ed9c: added link scraper

1 parent 7409f60 · commit 776ed9c

File tree: 2 files changed (+53, -8 lines)

08_basic_email_web_crawler.py

Lines changed: 9 additions & 8 deletions
@@ -6,6 +6,7 @@
 email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')
 
+
 def crawl(url, maxlevel):
 
     result = set()
@@ -25,21 +26,21 @@ def crawl(url, maxlevel):
             # Get an absolute URL for a link
             link = urlparse.urljoin(url, link)
 
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+        # Find all emails on current page
+        result.update(email_re.findall(req.text))
 
-            print "Crawled level: {}".format(maxlevel)
+        print "Crawled level: {}".format(maxlevel)
 
-            # new level
-            maxlevel -= 1
+        # new level
+        maxlevel -= 1
 
-            # recurse
-            crawl(link, maxlevel)
+        # recurse
+        crawl(link, maxlevel)
 
     return result
 
 emails = crawl('http://www.website_goes_here_dot_com', 2)
 
 print "\nScrapped e-mail addresses:"
 for email in emails:
-        print email
+    print email
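Every -/+ pair in the second hunk has identical text: the commit only re-indents those lines, so collecting e-mails, printing the level, decrementing maxlevel, and recursing now happen once per page in the while loop rather than once per followed link. Even after the change, the return value of crawl(link, maxlevel) is discarded, so addresses found on deeper pages never reach the caller. A minimal sketch of a version that merges the recursive results, assuming Python 3 (print() and urllib.parse.urljoin in place of the script's Python 2 print statement and urlparse module) and reusing the commit's own regexes:

import re
import requests
from urllib.parse import urljoin   # Python 3 home of urlparse.urljoin

email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
link_re = re.compile(r'href="(.*?)"')

def crawl(url, maxlevel):
    """Collect e-mail addresses from url and from pages up to maxlevel links deep."""
    result = set()
    if maxlevel <= 0:
        return result

    req = requests.get(url)
    if req.status_code != 200:
        return result

    # E-mails on the current page
    result.update(email_re.findall(req.text))
    print("Crawled level: {}".format(maxlevel))

    # Follow each link, keeping whatever the recursion finds
    for link in link_re.findall(req.text):
        result |= crawl(urljoin(url, link), maxlevel - 1)

    return result

Passing maxlevel - 1 down the call instead of mutating a shared counter keeps each branch's depth independent, which the original while-plus-recursion structure conflates.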

09_basic_link_web_crawler.py

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
+import requests
+import re
+import urlparse
+
+# regex
+link_re = re.compile(r'href="(.*?)"')
+
+
+def crawl(url, maxlevel):
+
+    result = set()
+
+    while maxlevel > 0:
+
+        # Get the webpage
+        req = requests.get(url)
+
+        # Check if successful
+        if(req.status_code != 200):
+            return []
+
+        # Find and follow all the links
+        links = link_re.findall(req.text)
+        for link in links:
+            # Get an absolute URL for a link
+            link = urlparse.urljoin(url, link)
+            # add links to result set
+            result.update(link)
+
+        print "Crawled level: {}".format(maxlevel)
+
+        # new level
+        maxlevel -= 1
+
+        # recurse
+        crawl(link, maxlevel)
+
+    return result
+
+emails = crawl('http://www.website_goes_here_dot_com', 2)
+
+print "\nScrapped links:"
+for link in links:
+    print link
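Two bugs in the new script are worth flagging. set.update() iterates its argument, so result.update(link) adds the individual characters of the URL string rather than the URL itself; result.add(link) is what's wanted. And the final loop reads for link in links:, but links exists only inside crawl, so the script dies with a NameError at module level; the set returned by crawl (confusingly bound to the name emails) is what should be printed. A corrected sketch under the same assumptions as above (Python 3, the commit's href regex), again merging what deeper levels find:

import re
import requests
from urllib.parse import urljoin   # Python 3 home of urlparse.urljoin

link_re = re.compile(r'href="(.*?)"')

def crawl(url, maxlevel):
    """Return the set of absolute URLs reachable from url within maxlevel hops."""
    result = set()
    if maxlevel <= 0:
        return result

    req = requests.get(url)
    if req.status_code != 200:
        return result

    print("Crawled level: {}".format(maxlevel))
    for link in link_re.findall(req.text):
        link = urljoin(url, link)
        result.add(link)                     # the URL itself, not its characters
        result |= crawl(link, maxlevel - 1)  # keep links found on deeper levels

    return result

links = crawl('http://www.website_goes_here_dot_com', 2)

print("\nScraped links:")
for link in links:
    print(link)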

0 commit comments