2 files changed: +53 -8 lines changed

First changed file (the e-mail crawler script):

@@ -6,6 +6,7 @@
 email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
 link_re = re.compile(r'href="(.*?)"')
 
+
 def crawl(url, maxlevel):
 
     result = set()

@@ -25,21 +26,21 @@ def crawl(url, maxlevel):
             # Get an absolute URL for a link
             link = urlparse.urljoin(url, link)
 
-            # Find all emails on current page
-            result.update(email_re.findall(req.text))
+            # Find all emails on current page
+            result.update(email_re.findall(req.text))
 
-        print "Crawled level: {}".format(maxlevel)
+        print "Crawled level: {}".format(maxlevel)
 
-        # new level
-        maxlevel -= 1
+        # new level
+        maxlevel -= 1
 
-        # recurse
-        crawl(link, maxlevel)
+        # recurse
+        crawl(link, maxlevel)
 
     return result
 
 emails = crawl('http://www.website_goes_here_dot_com', 2)
 
 print "\nScrapped e-mail addresses:"
 for email in emails:
-    print email
+    print email
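
Both scripts in this commit target Python 2: print is a statement and URLs are joined with the urlparse module (renamed urllib.parse in Python 3). The removed and re-added lines in the hunk above carry identical text, so the change appears to adjust only indentation, plus one blank line added before def crawl. For reference, below is a rough Python 3 sketch of the same e-mail crawler, not part of the commit: it keeps the two regexes, the maxlevel depth limit, and the example URL, but it recurses into every link and folds the recursive results back into the set, which the Python 2 version discards.

# Rough Python 3 sketch of the e-mail crawler above (not part of the commit);
# it assumes only the standard library plus the third-party requests package.
import re
from urllib.parse import urljoin

import requests

email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
link_re = re.compile(r'href="(.*?)"')


def crawl(url, maxlevel):
    result = set()

    if maxlevel <= 0:
        return result

    # Get the webpage
    req = requests.get(url)

    # Check if successful
    if req.status_code != 200:
        return result

    # Find all e-mail addresses on the current page
    result.update(email_re.findall(req.text))

    print("Crawled level: {}".format(maxlevel))

    # Follow every link one level deeper and keep whatever the
    # recursive calls find (the Python 2 version discards this)
    for link in link_re.findall(req.text):
        result.update(crawl(urljoin(url, link), maxlevel - 1))

    return result


emails = crawl('http://www.website_goes_here_dot_com', 2)

print("\nScraped e-mail addresses:")
for email in emails:
    print(email)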

Second changed file (new link-crawler script, added in full):

+import requests
+import re
+import urlparse
+
+# regex
+link_re = re.compile(r'href="(.*?)"')
+
+
+def crawl(url, maxlevel):
+
+    result = set()
+
+    while maxlevel > 0:
+
+        # Get the webpage
+        req = requests.get(url)
+
+        # Check if successful
+        if (req.status_code != 200):
+            return []
+
+        # Find and follow all the links
+        links = link_re.findall(req.text)
+        for link in links:
+            # Get an absolute URL for a link
+            link = urlparse.urljoin(url, link)
+            # add links to result set
+            result.update(link)
+
+        print "Crawled level: {}".format(maxlevel)
+
+        # new level
+        maxlevel -= 1
+
+        # recurse
+        crawl(link, maxlevel)
+
+    return result
+
+emails = crawl('http://www.website_goes_here_dot_com', 2)
+
+print "\nScrapped links:"
+for link in links:
+    print link
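
Three things in the new file are worth flagging. result.update(link) iterates its argument, so it adds the individual characters of the URL string to the set rather than the URL itself; set.add(link) would store the whole link. The value returned by the recursive crawl(link, maxlevel) call is never used. And the reporting loop at the bottom iterates over links, a name that only exists inside crawl, so as written it raises a NameError; the returned set (bound here to the variable emails, even though it holds links) is presumably what should be printed. A rough Python 3 sketch that addresses these points, again not part of the commit, is:

# Rough Python 3 sketch of the link crawler above (not part of the commit);
# it assumes only the standard library plus the third-party requests package.
import re
from urllib.parse import urljoin

import requests

link_re = re.compile(r'href="(.*?)"')


def crawl(url, maxlevel):
    result = set()

    if maxlevel <= 0:
        return result

    # Get the webpage
    req = requests.get(url)

    # Check if successful
    if req.status_code != 200:
        return result

    print("Crawled level: {}".format(maxlevel))

    # Follow every link one level deeper, storing the absolute URL itself
    # (set.add, not set.update) plus whatever the recursive calls return
    for link in link_re.findall(req.text):
        link = urljoin(url, link)
        result.add(link)
        result.update(crawl(link, maxlevel - 1))

    return result


links = crawl('http://www.website_goes_here_dot_com', 2)

print("\nScraped links:")
for link in links:
    print(link)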