|
1 | 1 | #!/usr/bin/env python
|
| 2 | +from __future__ import print_function |
2 | 3 | import logging
|
3 | 4 | import re
|
4 | 5 | import sys
|
|
20 | 21 | logging.basicConfig(level=logging.INFO)
|
21 | 22 | log = logging.getLogger()
|
22 | 23 |
|
| 24 | +if sys.version_info[0] == 2: |
| 25 | + str = unicode |
23 | 26 |
|
24 | 27 | REGEXES = {
|
25 | 28 | 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I),
|
@@ -81,11 +84,12 @@ def text_length(i):
|
81 | 84 | def compile_pattern(elements):
|
82 | 85 | if not elements:
|
83 | 86 | return None
|
84 |
| - if isinstance(elements, regexp_type): |
| 87 | + elif isinstance(elements, regexp_type): |
85 | 88 | return elements
|
86 |
| - if isinstance(elements, basestring): |
| 89 | + else: |
| 90 | + # assume string or string like object |
87 | 91 | elements = elements.split(',')
|
88 |
| - return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) |
| 92 | + return re.compile('|'.join([re.escape(x.lower()) for x in elements]), re.U) |
89 | 93 |
|
90 | 94 | class Document:
|
91 | 95 | """Class to build a etree document out of html."""
|
@@ -195,9 +199,20 @@ def summary(self, html_partial=False):
|
195 | 199 | continue
|
196 | 200 | else:
|
197 | 201 | return cleaned_article
|
198 |
| - except StandardError, e: |
| 202 | + except Exception as e: |
199 | 203 | log.exception('error getting summary: ')
|
200 |
| - raise Unparseable(str(e)), None, sys.exc_info()[2] |
| 204 | + if sys.version_info[0] == 2: |
| 205 | + # This is the only reason why we can't support Python 3.3: |
| 206 | + # 3.3's parser fails to accept the old syntax (although this |
| 207 | + # code never runs), which would require writing this |
| 208 | + # line simply as |
| 209 | + # Unparseable(str(e)) |
| 210 | + # but then we lose the traceback information. 3.4 on the |
| 211 | + # other hand accepts the old syntax and would only complain |
| 212 | + # at runtime. |
| 213 | + raise Unparseable(str(e)), None, sys.exc_info()[2] |
| 214 | + else: |
| 215 | + raise Unparseable(str(e)).with_traceback(sys.exc_info()[2]) |
201 | 216 |
|
202 | 217 | def get_article(self, candidates, best_candidate, html_partial=False):
|
203 | 218 | # Now that we have the top candidate, look through its siblings for
|
@@ -247,7 +262,7 @@ def get_article(self, candidates, best_candidate, html_partial=False):
|
247 | 262 | return output
|
248 | 263 |
|
249 | 264 | def select_best_candidate(self, candidates):
|
250 |
| - sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) |
| 265 | + sorted_candidates = sorted(list(candidates.values()), key=lambda x: x['content_score'], reverse=True) |
251 | 266 | for candidate in sorted_candidates[:5]:
|
252 | 267 | elem = candidate['elem']
|
253 | 268 | self.debug("Top 5 : %6.3f %s" % (
|
@@ -388,7 +403,7 @@ def transform_misused_divs_into_paragraphs(self):
|
388 | 403 | # This results in incorrect results in case there is an <img>
|
389 | 404 | # buried within an <a> for example
|
390 | 405 | if not REGEXES['divToPElementsRe'].search(
|
391 |
| - unicode(''.join(map(tostring, list(elem))))): |
| 406 | + str(''.join(map(str, map(tostring, list(elem)))))): |
392 | 407 | #self.debug("Altering %s to p" % (describe(elem)))
|
393 | 408 | elem.tag = "p"
|
394 | 409 | #print "Fixed element "+describe(elem)
|
@@ -609,18 +624,18 @@ def main():
|
609 | 624 |
|
610 | 625 | file = None
|
611 | 626 | if options.url:
|
612 |
| - import urllib |
613 |
| - file = urllib.urlopen(options.url) |
| 627 | + import urllib.request, urllib.parse, urllib.error |
| 628 | + file = urllib.request.urlopen(options.url) |
614 | 629 | else:
|
615 | 630 | file = open(args[0], 'rt')
|
616 | 631 | enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
|
617 | 632 | try:
|
618 |
| - print Document(file.read(), |
| 633 | + print(Document(file.read(), |
619 | 634 | debug=options.verbose,
|
620 | 635 | url=options.url,
|
621 | 636 | positive_keywords = options.positive_keywords,
|
622 | 637 | negative_keywords = options.negative_keywords,
|
623 |
| - ).summary().encode(enc, 'replace') |
| 638 | + ).summary().encode(enc, 'replace')) |
624 | 639 | finally:
|
625 | 640 | file.close()
|
626 | 641 |
|
|
0 commit comments