Use black to format the code · feedly/python-readability@e9acdd0 · GitHub
This repository was archived by the owner on Jan 4, 2022. It is now read-only.

Commit e9acdd0

Use black to format the code

1 parent 5a74140 commit e9acdd0

File tree

9 files changed: +419 -296 lines changed
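
All of the changes below are mechanical rewrites produced by black, the Python autoformatter; a typical invocation (assuming black is installed from PyPI via "pip install black") is simply "black readability/", which rewrites files in place. The diffs show black's usual patterns: single quotes normalized to double quotes, long call signatures exploded to one argument per line with trailing commas, two blank lines around top-level definitions, and two spaces before inline comments.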

readability/browser.py

Lines changed: 5 additions & 4 deletions

@@ -7,14 +7,15 @@ def open_in_browser(html):
     import os
     import webbrowser
     import tempfile
-    handle, fn = tempfile.mkstemp(suffix='.html')
-    f = os.fdopen(handle, 'wb')
+
+    handle, fn = tempfile.mkstemp(suffix=".html")
+    f = os.fdopen(handle, "wb")
     try:
         f.write(b"<meta charset='UTF-8' />")
-        f.write(html.encode('utf-8'))
+        f.write(html.encode("utf-8"))
     finally:
         # we leak the file itself here, but we should at least close it
         f.close()
-    url = 'file://' + fn.replace(os.path.sep, '/')
+    url = "file://" + fn.replace(os.path.sep, "/")
     webbrowser.open(url)
     return url
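
Behavior is unchanged by the reformat; a minimal usage sketch of this helper (the printed path is illustrative):

    from readability.browser import open_in_browser

    # Writes the markup to a temporary .html file, opens it in the default
    # browser, and returns the file:// URL that was opened.
    url = open_in_browser(u"<h1>Hello, readability</h1>")
    print(url)  # e.g. file:///tmp/tmpab12cd.html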

readability/cleaners.py

Lines changed: 33 additions & 17 deletions

@@ -2,35 +2,51 @@
 import re
 from lxml.html.clean import Cleaner
 
-bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+bad_attrs = ["width", "height", "style", "[-a-z]*color", "background[-a-z]*", "on*"]
 single_quoted = "'[^']+'"
 double_quoted = '"[^"]+"'
-non_space = '[^ "\'>]+'
-htmlstrip = re.compile("<" # open
-    "([^>]+) " # prefix
-    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
-    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
-    "([^>]*)" # postfix
-    ">" # end
-    , re.I)
+non_space = "[^ \"'>]+"
+htmlstrip = re.compile(
+    "<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ("|".join(bad_attrs),)
+    + "= *(?:%s|%s|%s)"  # undesirable attributes
+    % (non_space, single_quoted, double_quoted)
+    + "([^>]*)"  # value  # postfix
+    ">",  # end
+    re.I,
+)
 
 
 def clean_attributes(html):
     while htmlstrip.search(html):
-        html = htmlstrip.sub('<\\1\\2>', html)
+        html = htmlstrip.sub("<\\1\\2>", html)
     return html
 
 
 def normalize_spaces(s):
     if not s:
-        return ''
+        return ""
     """replace any sequence of whitespace
     characters with a single space"""
-    return ' '.join(s.split())
+    return " ".join(s.split())
 
 
-html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
-                       style=True, links=True, meta=False, add_nofollow=False,
-                       page_structure=False, processing_instructions=True, embedded=False,
-                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
-                       remove_unknown_tags=False, safe_attrs_only=False)
+html_cleaner = Cleaner(
+    scripts=True,
+    javascript=True,
+    comments=True,
+    style=True,
+    links=True,
+    meta=False,
+    add_nofollow=False,
+    page_structure=False,
+    processing_instructions=True,
+    embedded=False,
+    frames=False,
+    forms=False,
+    annoying_tags=False,
+    remove_tags=None,
+    remove_unknown_tags=False,
+    safe_attrs_only=False,
+)
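
A small sketch of the two helpers (the expected outputs follow from the regex shown above):

    from readability.cleaners import clean_attributes, normalize_spaces

    # Iteratively strips presentational attributes (width, height, style,
    # colors, backgrounds) from tags until none match.
    html = '<div id="x" style="color:red" width="500">hi</div>'
    print(clean_attributes(html))  # -> '<div id="x">hi</div>'

    # Collapses any run of whitespace characters to a single space.
    print(normalize_spaces("a \t b\n  c"))  # -> 'a b c'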

readability/compat/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -5,10 +5,11 @@
 syntax that can only be solved by conditionally importing different functions.
 """
 import sys
+
 if sys.version_info[0] == 2:
     bytes_ = str
     str_ = unicode
-
+
 elif sys.version_info[0] == 3:
     bytes_ = bytes
     str_ = str
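
The shim lets callers type-check text uniformly across Python 2 and 3; a hypothetical caller (ensure_text is not part of the package) might use it like this:

    from readability.compat import bytes_, str_

    def ensure_text(value, encoding="utf-8"):
        # bytes_/str_ alias str/unicode on Py2 and bytes/str on Py3.
        if isinstance(value, bytes_):
            return value.decode(encoding, "replace")
        assert isinstance(value, str_)
        return value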

readability/debug.py

Lines changed: 15 additions & 15 deletions

@@ -1,25 +1,25 @@
 import re
 
 
-#FIXME: use with caution, can leak memory
+# FIXME: use with caution, can leak memory
 uids = {}
 uids_document = None
 
 
 def describe_node(node):
     global uids
     if node is None:
-        return ''
-    if not hasattr(node, 'tag'):
+        return ""
+    if not hasattr(node, "tag"):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''):
-        name += '#' + node.get('id')
-    if node.get('class', '').strip():
-        name += '.' + '.'.join(node.get('class').split())
-    if name[:4] in ['div#', 'div.']:
+    if node.get("id", ""):
+        name += "#" + node.get("id")
+    if node.get("class", "").strip():
+        name += "." + ".".join(node.get("class").split())
+    if name[:4] in ["div#", "div."]:
         name = name[3:]
-    if name in ['tr', 'td', 'div', 'p']:
+    if name in ["tr", "td", "div", "p"]:
         uid = uids.get(node)
         if uid is None:
             uid = uids[node] = len(uids) + 1

@@ -34,18 +34,18 @@ def describe(node, depth=1):
     uids = {}
     uids_document = doc
 
-    #return repr(NodeRepr(node))
-    parent = ''
+    # return repr(NodeRepr(node))
+    parent = ""
     if depth and node.getparent() is not None:
-        parent = describe(node.getparent(), depth=depth - 1) + '>'
+        parent = describe(node.getparent(), depth=depth - 1) + ">"
     return parent + describe_node(node)
 
 
-RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)
+RE_COLLAPSE_WHITESPACES = re.compile(r"\s+", re.U)
 
 
 def text_content(elem, length=40):
-    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
+    content = RE_COLLAPSE_WHITESPACES.sub(" ", elem.text_content().replace("\r", ""))
     if len(content) < length:
         return content
-    return content[:length] + '...'
+    return content[:length] + "..."
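
A sketch of how the debugging helpers read in practice (the element tree is illustrative):

    import lxml.html
    from readability.debug import describe, text_content

    doc = lxml.html.fromstring(
        '<html><body><div id="main" class="post body">Some   long text</div></body></html>'
    )
    node = doc.get_element_by_id("main")

    # describe() renders a CSS-like path; the div#/div. prefix is trimmed.
    print(describe(node))  # e.g. 'body>#main.post.body'

    # text_content() collapses whitespace; past `length` chars it truncates
    # and appends '...'.
    print(text_content(node))  # -> 'Some long text'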

readability/encoding.py

Lines changed: 17 additions & 16 deletions

@@ -8,15 +8,16 @@
 RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
 
 CHARSETS = {
-    'big5': 'big5hkscs',
-    'gb2312': 'gb18030',
-    'ascii': 'utf-8',
-    'maccyrillic': 'cp1251',
-    'win1251': 'cp1251',
-    'win-1251': 'cp1251',
-    'windows-1251': 'cp1251',
+    "big5": "big5hkscs",
+    "gb2312": "gb18030",
+    "ascii": "utf-8",
+    "maccyrillic": "cp1251",
+    "win1251": "cp1251",
+    "win-1251": "cp1251",
+    "windows-1251": "cp1251",
 }
 
+
 def fix_charset(encoding):
     """Overrides encoding when charset declaration
     or charset determination is a subset of a larger

@@ -27,9 +28,9 @@ def fix_charset(encoding):
 
 def get_encoding(page):
     # Regex for XML and HTML Meta charset declaration
-    declared_encodings = (RE_CHARSET.findall(page) +
-                          RE_PRAGMA.findall(page) +
-                          RE_XML.findall(page))
+    declared_encodings = (
+        RE_CHARSET.findall(page) + RE_PRAGMA.findall(page) + RE_XML.findall(page)
+    )
 
     # Try any declared encodings
     for declared_encoding in declared_encodings:

@@ -38,7 +39,7 @@ def get_encoding(page):
         # declared_encoding will actually be bytes but .decode() only
         # accepts `str` type. Decode blindly with ascii because no one should
         # ever use non-ascii characters in the name of an encoding.
-        declared_encoding = declared_encoding.decode('ascii', 'replace')
+        declared_encoding = declared_encoding.decode("ascii", "replace")
 
         encoding = fix_charset(declared_encoding)
 

@@ -51,12 +52,12 @@ def get_encoding(page):
 
     # Fallback to chardet if declared encodings fail
     # Remove all HTML tags, and leave only text for chardet
-    text = re.sub(br'(\s*</?[^>]*>)+\s*', b' ', page).strip()
-    enc = 'utf-8'
+    text = re.sub(br"(\s*</?[^>]*>)+\s*", b" ", page).strip()
+    enc = "utf-8"
     if len(text) < 10:
-        return enc # can't guess
+        return enc  # can't guess
     res = chardet.detect(text)
-    enc = res['encoding'] or 'utf-8'
-    #print '->', enc, "%.2f" % res['confidence']
+    enc = res["encoding"] or "utf-8"
+    # print '->', enc, "%.2f" % res['confidence']
     enc = fix_charset(enc)
     return enc
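
A sketch of the detection flow (the snippets are illustrative; "windows-1251" is normalized to "cp1251" by the CHARSETS table above):

    from readability.encoding import get_encoding

    page = (b'<html><head><meta charset="windows-1251"></head>'
            b'<body>Some body text long enough to matter</body></html>')
    print(get_encoding(page))  # should print 'cp1251', from the declared charset

    # With no declaration and under 10 bytes of tag-stripped text, the
    # function gives up and falls back to 'utf-8'.
    print(get_encoding(b'<p>hi</p>'))  # -> 'utf-8'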

readability/htmls.py

Lines changed: 41 additions & 30 deletions

@@ -6,36 +6,38 @@
 from .encoding import get_encoding
 from .compat import str_
 
-utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
 
 
 def build_doc(page):
     if isinstance(page, str_):
         encoding = None
         decoded_page = page
     else:
-        encoding = get_encoding(page) or 'utf-8'
-        decoded_page = page.decode(encoding, 'replace')
+        encoding = get_encoding(page) or "utf-8"
+        decoded_page = page.decode(encoding, "replace")
 
     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
-    doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
+    doc = lxml.html.document_fromstring(
+        decoded_page.encode("utf-8", "replace"), parser=utf8_parser
+    )
     return doc, encoding
 
 
 def js_re(src, pattern, flags, repl):
-    return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
+    return re.compile(pattern, flags).sub(src, repl.replace("$", "\\"))
 
 
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
-        u'&mdash;': '-',
-        u'&ndash;': '-',
-        u'\u00A0': ' ',
-        u'\u00AB': '"',
-        u'\u00BB': '"',
-        u'&quot;': '"',
+        u"\u2014": "-",
+        u"\u2013": "-",
+        u"&mdash;": "-",
+        u"&ndash;": "-",
+        u"\u00A0": " ",
+        u"\u00AB": '"',
+        u"\u00BB": '"',
+        u"&quot;": '"',
     }
     for c, r in entities.items():
         if c in cur_title:

@@ -49,35 +51,44 @@ def norm_title(title):
 
 
 def get_title(doc):
-    title = doc.find('.//title')
+    title = doc.find(".//title")
     if title is None or title.text is None or len(title.text) == 0:
-        return '[no-title]'
+        return "[no-title]"
 
     return norm_title(title.text)
 
 
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
-        if text.replace('"', '') in orig.replace('"', ''):
+        if text.replace('"', "") in orig.replace('"', ""):
             collection.add(text)
 
 
-TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
-                        '.news_title', '.title', '.head', '.heading',
-                        '.contentheading', '.small_header_red']
+TITLE_CSS_HEURISTICS = [
+    "#title",
+    "#head",
+    "#heading",
+    ".pageTitle",
+    ".news_title",
+    ".title",
+    ".head",
+    ".heading",
+    ".contentheading",
+    ".small_header_red",
+]
 
 
 def shorten_title(doc):
-    title = doc.find('.//title')
+    title = doc.find(".//title")
     if title is None or title.text is None or len(title.text) == 0:
-        return ''
+        return ""
 
     title = orig = norm_title(title.text)
 
     candidates = set()
 
-    for item in ['.//h1', './/h2', './/h3']:
+    for item in [".//h1", ".//h2", ".//h3"]:
         for e in list(doc.iterfind(item)):
             if e.text:
                 add_match(candidates, e.text, orig)

@@ -94,7 +105,7 @@ def shorten_title(doc):
     if candidates:
         title = sorted(candidates, key=len)[-1]
     else:
-        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
+        for delimiter in [" | ", " - ", " :: ", " / "]:
             if delimiter in title:
                 parts = orig.split(delimiter)
                 if len(parts[0].split()) >= 4:

@@ -104,12 +115,12 @@
                 title = parts[-1]
                 break
     else:
-        if ': ' in title:
-            parts = orig.split(': ')
+        if ": " in title:
+            parts = orig.split(": ")
             if len(parts[-1].split()) >= 4:
                 title = parts[-1]
             else:
-                title = orig.split(': ', 1)[1]
+                title = orig.split(": ", 1)[1]
 
     if not 15 < len(title) < 150:
         return orig

@@ -119,15 +130,15 @@
 
 # is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
 def get_body(doc):
-    for elem in doc.xpath('.//script | .//link | .//style'):
+    for elem in doc.xpath(".//script | .//link | .//style"):
         elem.drop_tree()
     # tostring() always return utf-8 encoded string
     # FIXME: isn't better to use tounicode?
     raw_html = str_(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
-        #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
+        # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
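
Tying this file's helpers together, a minimal sketch (the markup is illustrative):

    from readability.htmls import build_doc, get_title, shorten_title

    html = (b"<html><head><title>Site Name | A Reasonably Long Article "
            b"Title</title></head><body><h1>A Reasonably Long Article "
            b"Title</h1></body></html>")

    # build_doc() detects the encoding and returns an lxml document plus
    # the encoding it used (None for str input).
    doc, encoding = build_doc(html)
    print(get_title(doc))      # -> 'Site Name | A Reasonably Long Article Title'
    print(shorten_title(doc))  # e.g. 'A Reasonably Long Article Title', via the <h1> match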
