Merge pull request #5 from RajuKoushik/patch-1 · glowerojo/python-scripts@780cad2 · GitHub
[go: up one dir, main page]

Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 780cad2

Browse files
committed
Merge pull request realpython#5 from RajuKoushik/patch-1
Update 08_basic_email_web_crawler.py
2 parents 7591683 + 761e0ec commit 780cad2

File tree

1 file changed

+14
-33
lines changed

1 file changed

+14
-33
lines changed

08_basic_email_web_crawler.py

Lines changed: 14 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,26 @@
11
import requests
22
import re
3-
try:
4-
from urllib.parse import urljoin
5-
except ImportError:
6-
from urlparse import urljoin
73

8-
# regex
9-
email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
10-
link_re = re.compile(r'href="(.*?)"')
4+
#get url
5+
#url=input('Enter a URL (include 'http://'):')--this is wrong
6+
url = input('Enter a URL (include `http://`): ')
117

128

13-
def crawl(url):
9+
#connect to the url
10+
website=requests.get(url)
1411

15-
result = set()
12+
#read html
13+
html=website.text
1614

17-
req = requests.get(url)
1815

19-
# Check if successful
20-
if(req.status_code != 200):
21-
return []
16+
#use re.findall to grab all the links
17+
links = re.findall('"((http|ftp)s?://.*?)"', html)
2218

23-
# Find links
24-
links = link_re.findall(req.text)
19+
emails=re.findall('([\w\.,]+@[\w\.,]+\.\w+)',html)
2520

26-
print("\nFound {} links".format(len(links)))
2721

28-
# Search links for emails
29-
for link in links:
22+
#prints the number of links in the list
23+
print("\nFound {} links".format(len(links)))
3024

31-
# Get an absolute URL for a link
32-
link = urljoin(url, link)
33-
34-
# Find all emails on current page
35-
result.update(email_re.findall(req.text))
36-
37-
return result
38-
39-
if __name__ == '__main__':
40-
emails = crawl('http://www.realpython.com')
41-
42-
print("\nScrapped e-mail addresses:")
43-
for email in emails:
44-
print(email)
45-
print("\n")
25+
for email in emails:
26+
print(email)

0 commit comments

Comments (0)
0