8000 Update 08_basic_email_web_crawler.py · cuartataifa/python-scripts@761e0ec · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 761e0ec

Browse files
committed
Update 08_basic_email_web_crawler.py
This is a much simpler version of the script (more easily understandable).
1 parent 436f311 commit 761e0ec

File tree

1 file changed

+14
-33
lines changed

1 file changed

+14
-33
lines changed

08_basic_email_web_crawler.py

Lines changed: 14 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1,45 +1,26 @@
11
import requests
22
import re
3-
try:
4-
from urllib.parse import urljoin
5-
except ImportError:
6-
from urlparse import urljoin
73

8-
# regex
9-
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
10-
link_re = re.compile(r'href="(.*?)"')
4+
#get url
5+
#url=input('Enter a URL (include 'http://'):')--this is wrong
6+
url = input('Enter a URL (include `http://`): ')
117

128

13-
def crawl(url):
9+
#connect to the url
10+
website=requests.get(url)
1411

15-
result = set()
12+
#read html
13+
html=website.text
1614

17-
req = requests.get(url)
1815

19-
# Check if successful
20-
if(req.status_code != 200):
21-
return []
16+
#use re.findall to grab all the links
17+
links = re.findall('"((http|ftp)s?://.*?)"', html)
2218

23-
# Find links
24-
links = link_re.findall(req.text)
19+
emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
2520

26-
print("\nFound {} links".format(len(links)))
2721

28-
# Search links for emails
29-
for link in links:
22+
#prints the number of links in the list
23+
print("\nFound {} links".format(len(links)))
3024

31-
# Get an absolute URL for a link
32-
link = urljoin(url, link)
33-
34-
# Find all emails on current page
35-
result.update(email_re.findall(req.text))
36-
37-
return result
38-
39-
if __name__ == '__main__':
40-
emails = crawl('http://www.realpython.com')
41-
42-
print("\nScrapped e-mail addresses:")
43-
for email in emails:
44-
print(email)
45-
print("\n")
25+
for email in emails:
26+
print(email)

0 commit comments

Comments
 (0)
0