Merge pull request #135 from adbar/master · feedly/python-readability@07f6861 · GitHub
This repository was archived by the owner on Jan 4, 2022. It is now read-only.

Commit 07f6861

Merge pull request buriy#135 from adbar/master
unnecessary imports removed; added lines for conformity and readability; linted code parts
2 parents 17ffad5 + bd8293e commit 07f6861

File tree

4 files changed (+22, -14 lines)

readability/cleaners.py

Lines changed: 3 additions & 0 deletions
@@ -14,18 +14,21 @@
                        ">"  # end
                        , re.I)
 
+
 def clean_attributes(html):
     while htmlstrip.search(html):
         html = htmlstrip.sub('<\\1\\2>', html)
     return html
 
+
 def normalize_spaces(s):
     if not s:
         return ''
     """replace any sequence of whitespace
     characters with a single space"""
     return ' '.join(s.split())
 
+
 html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                        style=True, links=True, meta=False, add_nofollow=False,
                        page_structure=False, processing_instructions=True, embedded=False,
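
The two helpers changed above are easy to exercise directly. A minimal usage sketch, assuming the package layout of this repo (readability/cleaners.py); exactly which attributes clean_attributes() strips depends on the htmlstrip regex defined earlier in that file, so the attribute shown here is an assumption:

    from readability.cleaners import clean_attributes, normalize_spaces

    # Collapse any run of whitespace characters to single spaces.
    print(normalize_spaces("one\t two\n\n three"))  # -> 'one two three'
    print(normalize_spaces(""))                     # -> ''

    # Rewrite tags until htmlstrip no longer matches, dropping the
    # attributes the regex targets (assumed here: presentational
    # attributes such as style).
    print(clean_attributes('<td style="color:red">cell</td>'))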

readability/debug.py

Lines changed: 0 additions & 2 deletions
@@ -49,5 +49,3 @@ def text_content(elem, length=40):
     if len(content) < length:
         return content
     return content[:length] + '...'
-
-

readability/htmls.py

Lines changed: 12 additions & 3 deletions
@@ -1,29 +1,31 @@
 from lxml.html import tostring
-import logging
 import lxml.html
-import re, sys
+import re
 
 from .cleaners import normalize_spaces, clean_attributes
 from .encoding import get_encoding
 from .compat import str_
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+
 def build_doc(page):
     if isinstance(page, str_):
         encoding = None
         decoded_page = page
     else:
         encoding = get_encoding(page) or 'utf-8'
         decoded_page = page.decode(encoding, 'replace')
-
+
     # XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
     doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
     return doc, encoding
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
+
 def normalize_entities(cur_title):
     entities = {
         u'\u2014':'-',
@@ -41,26 +43,31 @@ def normalize_entities(cur_title):
 
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
         return '[no-title]'
 
     return norm_title(title.text)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
 TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
                         '.news_title', '.title', '.head', '.heading',
                         '.contentheading', '.small_header_red']
 
+
 def shorten_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -109,6 +116,8 @@ def shorten_title(doc):
 
     return title
 
+
+# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
 def get_body(doc):
     for elem in doc.xpath('.//script | .//link | .//style'):
         elem.drop_tree()
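
A short usage sketch for the title helpers touched above; the import path assumes this package, and the HTML fragment is purely illustrative:

    from readability.htmls import build_doc, get_title

    # build_doc() accepts str or bytes; for bytes it guesses the encoding
    # (falling back to utf-8) and round-trips through decode/encode to
    # drop bad characters before parsing.
    doc, encoding = build_doc(b'<html><head><title> Example \xe2\x80\x94 News </title></head><body></body></html>')

    # get_title() runs the <title> text through norm_title(), which
    # collapses whitespace and maps entities such as u'\u2014' to '-'.
    print(get_title(doc))  # -> 'Example - News'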

readability/readability.py

Lines changed: 7 additions & 9 deletions
@@ -4,7 +4,6 @@
 import re
 import sys
 
-from collections import defaultdict
 from lxml.etree import tostring
 from lxml.etree import tounicode
 from lxml.html import document_fromstring
@@ -56,7 +55,6 @@ def to_int(x):
 def clean(text):
     # Many spaces make the following regexes run forever
     text = re.sub(r'\s{255,}', ' ' * 255, text)
-
     text = re.sub(r'\s*\n\s*', '\n', text)
     text = re.sub(r'\t|[ \t]{2,}', ' ', text)
     return text.strip()
@@ -65,12 +63,11 @@ def clean(text):
 def text_length(i):
     return len(clean(i.text_content() or ""))
 
-regexp_type = type(re.compile('hello, world'))
 
 def compile_pattern(elements):
     if not elements:
         return None
-    elif isinstance(elements, regexp_type):
+    elif isinstance(elements, re._pattern_type):
         return elements
     elif isinstance(elements, (str_, bytes_)):
         if isinstance(elements, bytes_):
@@ -82,6 +79,7 @@ def compile_pattern(elements):
     raise Exception("Unknown type for the pattern: {}".format(type(elements)))
     # assume string or string like object
 
+
 class Document:
     """Class to build a etree document out of html."""
 
@@ -98,9 +96,9 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
         :param xpath: If set to True, adds x="..." attribute to each HTML node,
             containing xpath path pointing to original document path (allows to
             reconstruct selected summary in original document).
-        :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
+        :param handle_failures: Parameter passed to `lxml` for handling failure during exception.
             Support options = ["discard", "ignore", None]
-
+
         Examples:
             positive_keywords=["news-item", "block"]
             positive_keywords=["news-item, block"]
@@ -290,7 +288,7 @@ def select_best_candidate(self, candidates):
             return None
 
         sorted_candidates = sorted(
-            candidates.values(),
+            candidates.values(),
             key=lambda x: x['content_score'],
             reverse=True
         )
@@ -517,10 +515,10 @@ def sanitize(self, node, candidates):
 
                 #if el.tag == 'div' and counts["img"] >= 1:
                 #    continue
-                if counts["p"] and counts["img"] > 1+counts["p"]*1.3:
+                if counts["p"] and counts["img"] > 1 + counts["p"]*1.3:
                     reason = "too many images (%s)" % counts["img"]
                     to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] and tag not in ("ol", "ul"):
                     reason = "more <li>s than <p>s"
                     to_remove = True
                 elif counts["input"] > (counts["p"] / 3):

0 commit comments
