-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathmediumScraper.py
More file actions
59 lines (44 loc) · 1.63 KB
/
mediumScraper.py
File metadata and controls
59 lines (44 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
import requests
import argparse
import sys
import json
from urllib import parse
from bs4 import BeautifulSoup
def parseArgs():
    """Parse command-line arguments for the Medium topic scraper.

    Returns an argparse.Namespace with:
      topic     (str)  -- the search topic (positional, required)
      count     (int)  -- maximum number of posts (default 15)
      beautiefy (bool) -- pretty-print the JSON output (default False)
    """
    parser = argparse.ArgumentParser(description='Gets posts from Medium.com for a specific topic.')
    parser.add_argument('topic', metavar='TOPIC', type=str,
                        help='the topic to search for')
    # type=int so args.count is an int whether it comes from the CLI
    # (which would otherwise yield a string) or from the default.
    parser.add_argument('-c', '--count', dest='count', action='store', type=int, default=15,
                        help='maximum number of posts')
    # NOTE: dest stays 'beautiefy' (misspelled) for backward compatibility --
    # run() reads args.beautiefy. Only the user-facing help text is corrected.
    parser.add_argument('-b', '--beautify', dest='beautiefy', action='store_true',
                        help='beautify JSON output')
    return parser.parse_args()
def run(args):
    """Fetch Medium search results for *args.topic* and print them as JSON.

    Queries the Medium search endpoint, extracts title/description/URL for
    each post card found, and prints the list as JSON to stdout. Exits with
    status 0 (after a message) when no posts are found. Raises
    requests.HTTPError on a non-2xx response.
    """
    def parsePost(tag):
        # Extract one post card into a plain dict; every field degrades to ''
        # when the expected element is missing from the markup.
        title = tag.find('h3', class_='graf')
        desc = tag.find('p')
        anchors = tag.find_all('a')
        # Guard the index: the original anchors[3] raised IndexError on cards
        # with fewer than four links, before the `if url` check could apply.
        link = anchors[3] if len(anchors) > 3 else None
        return {
            'title': title.text if title else '',
            'desc': desc.text if desc else '',
            'url': link.get('href').split('?')[0] if link else '',
        }

    urlParams = {
        'topic': parse.quote(args.topic),
        'count': args.count,
    }
    url = 'https://medium.com/search/posts?q={topic}&count={count}'.format_map(urlParams)

    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently scraping an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    rawPosts = soup.find_all('div', class_='postArticle')
    if not rawPosts:
        print('No posts found for "%s"...' % args.topic)
        sys.exit(0)

    posts = [parsePost(post) for post in rawPosts]
    print(json.dumps(posts, indent=(4 if args.beautiefy else None)))
# Script entry point: parse CLI arguments, then scrape and print the posts.
if __name__ == '__main__':
    run(parseArgs())