Released v 0.3, and uploaded to the pypi. · Harry0201/python-readability@08658d1 · GitHub
[go: up one dir, main page]

Skip to content

Commit 08658d1

Browse files
committed
Released v 0.3, and uploaded to the pypi.
1 parent 4e3192f commit 08658d1

File tree

4 files changed

+71
-23
lines changed

4 files changed

+71
-23
lines changed

README

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,17 +35,25 @@ Command-line usage::
3535
python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
3636

3737

38+
Using positive/negative keywords example::
39+
40+
python -m readability.readability -p intro -n newsindex,homepage-box,news-section -u http://python.org
41+
42+
3843
Document() kwarg options:
3944

4045
- attributes:
4146
- debug: output debug messages
4247
- min_text_length:
4348
- retry_length:
4449
- url: will allow adjusting links to be absolute
50+
- positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
51+
- negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
4552

4653

4754
Updates
4855

4956
- 0.2.5 Update setup.py for uploading .tar.gz to pypi
5057
- 0.2.6 Don't crash on documents with no title
5158
- 0.2.6.1 Document.short_title() properly works
59+
- 0.3 Added Document.encoding, positive_keywords and negative_keywords

readability/htmls.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@
33
from lxml.html import tostring
44
import logging
55
import lxml.html
6-
import re
7-
8-
logging.getLogger().setLevel(logging.DEBUG)
6+
import re, sys
97

108
utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
119

@@ -14,9 +12,19 @@ def build_doc(page):
1412
page_unicode = page
1513
else:
1614
enc = get_encoding(page)
17-
page_unicode = page.decode(enc, 'replace')
15+
if enc:
16+
page_unicode = page.decode(enc, 'replace')
17+
encoding = enc
18+
else:
19+
try:
20+
#try utf-8
21+
page_unicode = page.decode('utf-8', 'strict')
22+
encoding = 'utf-8'
23+
except UnicodeDecodeError:
24+
page_unicode = page.decode('utf-8', 'replace')
25+
encoding = 'utf-8'
1826
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
19-
return doc
27+
return doc, encoding
2028

2129
def js_re(src, pattern, flags, repl):
2230
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -111,5 +119,5 @@ def get_body(doc):
111119
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
112120
return cleaned
113121
except Exception: #FIXME find the equivalent lxml error
114-
logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
122+
#logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
115123
return raw_html

readability/readability.py

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,23 @@ def clean(text):
7676
def text_length(i):
7777
return len(clean(i.text_content() or ""))
7878

79+
regexp_type = type(re.compile('hello, world'))
80+
81+
def compile_pattern(elements):
82+
if not elements:
83+
return None
84+
if isinstance(elements, regexp_type):
85+
return elements
86+
if isinstance(elements, basestring):
87+
elements = elements.split(',')
88+
return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
7989

8090
class Document:
8191
"""Class to build a etree document out of html."""
8292
TEXT_LENGTH_THRESHOLD = 25
8393
RETRY_LENGTH = 250
8494

85-
def __init__(self, input, **options):
95+
def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
8696
"""Generate the document
8797
8898
:param input: string of the html content.
@@ -93,19 +103,24 @@ def __init__(self, input, **options):
93103
- min_text_length:
94104
- retry_length:
95105
- url: will allow adjusting links to be absolute
96-
106+
- positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
107+
- negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
108+
Also positive_keywords and negative_keywords could be a regexp.
97109
"""
98110
self.input = input
99111
self.options = options
100112
self.html = None
113+
self.encoding = None
114+
self.positive_keywords = compile_pattern(positive_keywords)
115+
self.negative_keywords = compile_pattern(negative_keywords)
101116

102117
def _html(self, force=False):
103118
if force or self.html is None:
104119
self.html = self._parse(self.input)
105120
return self.html
106121

107122
def _parse(self, input):
108-
doc = build_doc(input)
123+
doc, self.encoding = build_doc(input)
109124
doc = html_cleaner.clean_html(doc)
110125
base_href = self.options.get('url', None)
111126
if base_href:
@@ -311,19 +326,25 @@ def score_paragraphs(self, ):
311326

312327
def class_weight(self, e):
313328
weight = 0
314-
if e.get('class', None):
315-
if REGEXES['negativeRe'].search(e.get('class')):
316-
weight -= 25
329+
for feature in [e.get('class', None), e.get('id', None)]:
330+
if feature:
331+
if REGEXES['negativeRe'].search(feature):
332+
weight -= 25
333+
334+
if REGEXES['positiveRe'].search(feature):
335+
weight += 25
336+
337+
if self.positive_keywords and self.positive_keywords.search(feature):
338+
weight += 25
317339

318-
if REGEXES['positiveRe'].search(e.get('class')):
319-
weight += 25
340+
if self.negative_keywords and self.negative_keywords.search(feature):
341+
weight -= 25
320342

321-
if e.get('id', None):
322-
if REGEXES['negativeRe'].search(e.get('id')):
323-
weight -= 25
343+
if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag):
344+
weight += 25
324345

325-
if REGEXES['positiveRe'].search(e.get('id')):
326-
weight += 25
346+
if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag):
347+
weight -= 25
327348

328349
return weight
329350

@@ -569,6 +590,8 @@ def main():
569590
parser = OptionParser(usage="%prog: [options] [file]")
570591
parser.add_option('-v', '--verbose', action='store_true')
571592
parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
593+
parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
594+
parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
572595
(options, args) = parser.parse_args()
573596

574597
if not (len(args) == 1 or options.url):
@@ -581,11 +604,14 @@ def main():
581604
file = urllib.urlopen(options.url)
582605
else:
583606
file = open(args[0], 'rt')
584-
enc = sys.__stdout__.encoding or 'utf-8'
607+
enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
585608
try:
586609
print Document(file.read(),
587610
debug=options.verbose,
588-
url=options.url).summary().encode(enc, 'replace')
611+
url=options.url,
612+
positive_keywords = options.positive_keywords,
613+
negative_keywords = options.negative_keywords,
614+
).summary().encode(enc, 'replace')
589615
finally:
590616
file.close()
591617

setup.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
#!/usr/bin/env python
22
from setuptools import setup, find_packages
3+
import sys
4+
5+
if sys.platform == 'darwin':
6+
lxml = "lxml<2.4"
7+
else:
8+
lxml = "lxml"
39

410
setup(
511
name="readability-lxml",
6-
version="0.2.6.1",
12+
version="0.3",
713
author="Yuri Baburov",
814
author_email="burchik@gmail.com",
915
description="fast python port of arc90's readability tool",
@@ -14,7 +20,7 @@
1420
packages=['readability'],
1521
install_requires=[
1622
"chardet",
17-
"lxml"
23+
lxml
1824
],
1925
classifiers=[
2026
"Environment :: Web Environment",

0 commit comments

Comments (0)