@@ -108,6 +108,7 @@ def tags(node, *tag_names):
         for e in node.findall('.//%s' % tag_name):
             yield e
 
+
 def class_weight(e):
     weight = 0
     if e.get('class', None):
@@ -126,6 +127,7 @@ def class_weight(e):
 
     return weight
 
+
 def score_node(elem):
     content_score = class_weight(elem)
     name = elem.tag.lower()
@@ -146,7 +148,8 @@ def score_node(elem):
 def transform_misused_divs_into_paragraphs(doc):
     for elem in tags(doc, 'div'):
         # transform <div>s that do not contain other block elements into <p>s
-        if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
+        if not REGEXES['divToPElementsRe'].search(
+                unicode(''.join(map(tostring, list(elem))))):
             logging.debug("Altering %s to p" % (describe(elem)))
             elem.tag = "p"
             #print "Fixed element "+describe(elem)
@@ -166,12 +169,15 @@ def transform_misused_divs_into_paragraphs(doc):
                 p.text = child.tail
                 child.tail = None
                 elem.insert(pos + 1, p)
-                logging.debug("Inserted %s to %s" % (tounicode(p), describe(elem)))
+                logging.debug("Inserted %s to %s" % (
+                    tounicode(p),
+                    describe(elem)))
                 #print "Inserted "+tounicode(p)+" to "+describe(elem)
             if child.tag == 'br':
                 #print 'Dropped <br> at '+describe(elem)
                 child.drop_tree()
 
+
 def remove_unlikely_candidates(doc):
     for elem in doc.iter():
         s = "%s %s" % (elem.get('class', ''), elem.get('id', ''))
@@ -184,6 +190,7 @@ def remove_unlikely_candidates(doc):
             logging.debug("Removing unlikely candidate - %s" % describe(elem))
             elem.drop_tree()
 
+
 def get_link_density(elem):
     link_length = 0
     for i in elem.findall(".//a"):
@@ -232,17 +239,23 @@ def score_paragraphs(doc, min_text_len):
         if grand_parent_node is not None:
             candidates[grand_parent_node]['content_score'] += content_score / 2.0
 
-    # Scale the final candidates score based on link density. Good content should have a
-    # relatively small link density (5% or less) and be mostly unaffected by this operation.
+    # Scale the final candidates score based on link density. Good content
+    # should have a relatively small link density (5% or less) and be mostly
+    # unaffected by this operation.
     for elem in ordered:
         candidate = candidates[elem]
         ld = get_link_density(elem)
         score = candidate['content_score']
-        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score * (1 - ld)))
+        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
+            score,
+            describe(elem),
+            ld,
+            score * (1 - ld)))
         candidate['content_score'] *= (1 - ld)
 
     return candidates
 
+
 def select_best_candidate(candidates):
     sorted_candidates = sorted(candidates.values(),
                                key=lambda x: x['content_score'],
@@ -266,6 +279,7 @@ def reverse_tags(node, *tag_names):
         for e in reversed(node.findall('.//%s' % tag_name)):
             yield e
 
+
 def sanitize(node, candidates, min_text_len):
     for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
         if class_weight(header) < 0 or get_link_density(header) > 0.33:
@@ -293,10 +307,11 @@ def sanitize(node, candidates, min_text_len):
         elif el.text_content().count(",") < 10:
             counts = {}
             for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
-                counts[kind] = len(el.findall('.//%s' % kind))
+                counts[kind] = len(el.findall('.//%s' % kind))
             counts["li"] -= 100
 
-            content_length = text_length(el) # Count the text length excluding any surrounding whitespace
+            # Count the text length excluding any surrounding whitespace
+            content_length = text_length(el)
             link_density = get_link_density(el)
             parent_node = el.getparent()
             if parent_node is not None:
@@ -347,26 +362,26 @@ def sanitize(node, candidates, min_text_len):
 
                 #find x non empty preceding and succeeding siblings
                 i, j = 0, 0
-                x  = 1
+                x = 1
                 siblings = []
                 for sib in el.itersiblings():
                     #logging.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        i =+ 1
+                        i += 1
                         siblings.append(sib_content_length)
                         if i == x:
                             break
                 for sib in el.itersiblings(preceding=True):
                     #logging.debug(sib.text_content())
                     sib_content_length = text_length(sib)
                     if sib_content_length:
-                        j =+ 1
+                        j += 1
                         siblings.append(sib_content_length)
                         if j == x:
                             break
                 #logging.debug(str(siblings))
-                if siblings and sum(siblings) > 1000:
+                if siblings and sum(siblings) > 1000:
                     to_remove = False
                     logging.debug("Allowing %s" % describe(el))
                     for desnode in tags(el, "table", "ul", "div"):
@@ -388,21 +403,22 @@ def sanitize(node, candidates, min_text_len):
 
 
 def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
-    # Now that we have the top candidate, look through its siblings for content that might also be related.
-    # Things like preambles, content split by ads that we removed, etc.
-
+    # Now that we have the top candidate, look through its siblings for
+    # content that might also be related. Things like preambles, content
+    # split by ads that we removed, etc.
     sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
     if enclose_with_html_tag:
         output = document_fromstring('<div/>')
     else:
         output = fragment_fromstring('<div/>')
     best_elem = best_candidate['elem']
     for sibling in best_elem.getparent().getchildren():
-        #if isinstance(sibling, NavigableString): continue#in lxml there no concept of simple text
+        #if isinstance(sibling, NavigableString): continue#in lxml there no
+        # concept of simple text
         append = False
         if sibling is best_elem:
             append = True
-        sibling_key = sibling # HashableElement(sibling)
+        sibling_key = sibling  # HashableElement(sibling)
 
         # Print out sibling information for debugging.
         if sibling_key in candidates:
@@ -476,16 +492,15 @@ def get_article(doc, min_text_len, retry_len, enclose_with_html_tag=True):
             of_acceptable_length = len(cleaned_article or '') >= retry_len
             if ruthless and not of_acceptable_length:
                 ruthless = False
-                continue # try again
+                continue  # try again
             else:
                 return Summary(confidence=confidence,
                                html=cleaned_article,
                                short_title=shorten_title(doc),
                                title=get_title(doc))
 
     except StandardError as e:
-        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
-        logging.exception('error getting summary: ')
+        logging.exception('error getting summary: ')
         raise Unparseable(str(e)), None, sys.exc_info()[2]
 
 
@@ -533,6 +548,7 @@ def clean_segment_number(segments, index, segment):
     else:
         return segment
 
+
 def clean_segment_index(segments, index, segment):
     if index == (len(segments) - 1) and segment.lower() == 'index':
         return None
@@ -555,6 +571,7 @@ def clean_segment_short(segments, index, segment):
     else:
         return segment
 
+
 def clean_segment(segments, index, segment):
     """
     Cleans a single segment of a URL to find the base URL. The base URL is as
@@ -613,6 +630,7 @@ def __init__(self, link_text, href):
         self.href = href
         self.score = 0
 
+
 def same_domain(lhs, rhs):
     split_lhs = urlparse.urlsplit(lhs)
     split_rhs = urlparse.urlsplit(rhs)
@@ -625,6 +643,7 @@ def same_domain(lhs, rhs):
 def strip_trailing_slash(s):
     return re.sub(r'/$', '', s)
 
+
 def eval_href(parsed_urls, url, base_url, link):
     raw_href = link.get('href')
     if raw_href is None:
@@ -644,13 +663,15 @@ def eval_href(parsed_urls, url, base_url, link):
 
     return raw_href, href, True
 
+
 def eval_link_text(link):
     link_text = clean(link.text_content() or '')
     if REGEXES['extraneous'].search(link_text) or len(link_text) > 25:
         return link_text, False
     else:
         return link_text, True
 
+
 def find_or_create_page(candidates, href, link_text):
     '''
     Finds or creates a candidate page object for a next-page href. If one
@@ -666,6 +687,7 @@ def find_or_create_page(candidates, href, link_text):
         candidates[href] = candidate
         return candidate, True
 
+
 def eval_possible_next_page_link(
         parsed_urls, url, base_url, candidates, link):