Switched to use python logging module. · appscluster/python-readability@ae1f1ad · GitHub

Commit ae1f1ad

Switched to use python logging module.
Added xpath option (undocumented yet).
1 parent 2fab5ff commit ae1f1ad
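
The loggers added in this commit produce no output until the application configures a handler. A minimal sketch of how a consumer could surface the new messages (the handler and format choices are illustrative, not part of this commit):

import logging

# Route the library's loggers ('readability.encoding', 'readability.htmls')
# to stderr; DEBUG level shows the encoding-detection messages added here.
logging.basicConfig(level=logging.DEBUG,
                    format='%(name)s %(levelname)s: %(message)s')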

5 files changed: +214 -137 lines changed

readability/debug.py

Lines changed: 54 additions & 17 deletions
@@ -1,25 +1,62 @@
-def save_to_file(text, filename):
-    f = open(filename, 'wt')
-    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
-    f.write(text.encode('utf-8'))
-    f.close()
+import re
 
-uids = {}
-def describe(node, depth=2):
+
+uids = {}
+RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)
+
+
+def open_in_browser(html):
+    """
+    Open the HTML document in a web browser, saving it to a temporary
+    file to open it. Note that this does not delete the file after
+    use. This is mainly meant for debugging.
+    """
+    import os
+    import webbrowser
+    import tempfile
+    handle, fn = tempfile.mkstemp(suffix='.html')
+    f = os.fdopen(handle, 'wb')
+    try:
+        f.write("<meta charset='UTF-8' />")
+        f.write(html.encode('utf-8'))
+    finally:
+        # we leak the file itself here, but we should at least close it
+        f.close()
+    url = 'file://' + fn.replace(os.path.sep, '/')
+    webbrowser.open(url)
+    return url
+
+
+def describe_node(node):
+    if node is None:
+        return ''
     if not hasattr(node, 'tag'):
         return "[%s]" % type(node)
     name = node.tag
-    if node.get('id', ''): name += '#'+node.get('id')
-    if node.get('class', ''):
-        name += '.' + node.get('class').replace(' ','.')
+    if node.get('id', ''):
+        name += '#' + node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ', '.')
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
-        if not node in uids:
-            uid = uids[node] = len(uids)+1
-        else:
-            uid = uids.get(node)
-        name += "%02d" % (uid)
-    if depth and node.getparent() is not None:
-        return name+' - '+describe(node.getparent(), depth-1)
+        uid = uids.get(node)
+        if uid is None:
+            uid = uids[node] = len(uids) + 1
+        name += "{%02d}" % uid
     return name
+
+
+def describe(node, depth=2):
+    #return repr(NodeRepr(node))
+    parent = ''
+    if depth and node.getparent() is not None:
+        parent = describe(node.getparent(), depth=depth - 1)
+    return parent + '/' + describe_node(node)
+
+
+def text_content(elem, length=40):
+    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
+    if len(content) < length:
+        return content
+    return content[:length] + '...'
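
The rewritten helpers split naming (describe_node) from path building (describe) and are importable for ad-hoc debugging. A rough usage sketch with made-up markup; the outputs in the comments follow from the code above:

import lxml.html
from readability.debug import describe, text_content, open_in_browser

doc = lxml.html.document_fromstring(
    "<div id='main' class='post body'><p>Hello world</p></div>")
elem = doc.find('.//p')
print describe(elem)       # '/body/#main.post.body/p{01}' -- uids are per-process
print text_content(elem)   # whitespace-collapsed, capped at 40 chars: 'Hello world'
open_in_browser(u'<p>snapshot</p>')  # writes a temp .html file and opens the browser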

readability/encoding.py

Lines changed: 46 additions & 36 deletions
@@ -1,48 +1,58 @@
 import re
 import chardet
+import logging
+
+log = logging.getLogger('readability.encoding')
+
+
+RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', re.I)
+RE_PRAGMA = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', re.I)
+RE_XML = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+CHARSETS = {
+    'big5': 'big5hkscs',
+    'gb2312': 'gb18030',
+    'ascii': 'utf-8',
+    'MacCyrillic': 'cp1251',
+}
+
+
+def fix_charset(encoding):
+    """Overrides encoding when charset declaration
+    or charset determination is a subset of a larger
+    charset. Created because of issues with Chinese websites"""
+    encoding = encoding.lower()
+    return CHARSETS.get(encoding, encoding)
+
 
 def get_encoding(page):
-    # Regex for XML and HTML Meta charset declaration
-    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
-    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
-    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
-
-    declared_encodings = (charset_re.findall(page) +
-                          pragma_re.findall(page) +
-                          xml_re.findall(page))
-
-    # Try any declared encodings
-    if len(declared_encodings) > 0:
-        for declared_encoding in declared_encodings:
-            try:
-                page.decode(custom_decode(declared_encoding))
-                return custom_decode(declared_encoding)
-            except UnicodeDecodeError:
-                pass
+    declared_encodings = (RE_CHARSET.findall(page) +
+                          RE_PRAGMA.findall(page) +
+                          RE_XML.findall(page))
+
+    log.debug("Document has the following encodings: %s" % declared_encodings)
+
+    # Try declared encodings, if any
+    for declared_encoding in declared_encodings:
+        encoding = fix_charset(declared_encoding)
+        try:
+            page.decode(encoding)
+            log.info('Using encoding "%s"' % encoding)
+            return encoding
+        except UnicodeDecodeError:
+            log.info('Encoding "%s", specified in the document as "%s" '
+                     'didn\'t work' % (encoding, declared_encoding))
+            print "Content encoding didn't work:", encoding
 
     # Fallback to chardet if declared encodings fail
     text = re.sub('</?[^>]*>\s*', ' ', page)
     enc = 'utf-8'
     if not text.strip() or len(text) < 10:
-        return enc # can't guess
+        log.debug("Can't guess encoding because text is too short")
+        return enc
     res = chardet.detect(text)
-    enc = res['encoding']
+    enc = fix_charset(res['encoding'])
+    log.info('Trying encoding "%s" guessed '
+             'with confidence %.2f' % (enc, res['confidence']))
     #print '->', enc, "%.2f" % res['confidence']
-    enc = custom_decode(enc)
     return enc
-
-def custom_decode(encoding):
-    """Overrides encoding when charset declaration
-    or charset determination is a subset of a larger
-    charset. Created because of issues with Chinese websites"""
-    encoding = encoding.lower()
-    alternates = {
-        'big5': 'big5hkscs',
-        'gb2312': 'gb18030',
-        'ascii': 'utf-8',
-        'MacCyrillic': 'cp1251',
-    }
-    if encoding in alternates:
-        return alternates[encoding]
-    else:
-        return encoding
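
get_encoding now centralizes charset aliasing in fix_charset and logs each decision. A quick sketch of the resulting flow, on byte strings invented for illustration:

from readability.encoding import get_encoding

# A declared charset wins when the page actually decodes with it:
print get_encoding('<?xml version="1.0" encoding="koi8-r"?><doc/>')  # 'koi8-r'

# Subset charsets are widened via the CHARSETS table (gb2312 -> gb18030):
print get_encoding('<meta charset="gb2312"><html>...</html>')        # 'gb18030'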

readability/htmls.py

Lines changed: 30 additions & 10 deletions
@@ -3,28 +3,36 @@
 from lxml.html import tostring
 import logging
 import lxml.html
-import re, sys
+import re
+
+log = logging.getLogger('readability.htmls')
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
+
+def lxml_fromstring(doc):
+    return lxml.html.document_fromstring(doc, parser=utf8_parser)
+
+
 def build_doc(page):
     if isinstance(page, unicode):
         enc = None
-        page_unicode = page
+        unicode_page = page
     else:
         enc = get_encoding(page) or 'utf-8'
-        page_unicode = page.decode(enc, 'replace')
-    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+        unicode_page = page.decode(enc, 'replace')
+    doc = lxml_fromstring(unicode_page.encode('utf-8', 'replace').replace('\r', ''))
     return doc, enc
 
+
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
 
 
 def normalize_entities(cur_title):
     entities = {
-        u'\u2014':'-',
-        u'\u2013':'-',
+        u'\u2014': '-',
+        u'\u2013': '-',
         u'&mdash;': '-',
         u'&ndash;': '-',
         u'\u00A0': ' ',
@@ -38,22 +46,31 @@ def normalize_entities(cur_title):
 
     return cur_title
 
+
 def norm_title(title):
     return normalize_entities(normalize_spaces(title))
 
+
 def get_title(doc):
     title = doc.find('.//title')
     if title is None or len(title.text) == 0:
         return '[no-title]'
 
     return norm_title(title.text)
 
+
 def add_match(collection, text, orig):
     text = norm_title(text)
     if len(text.split()) >= 2 and len(text) >= 15:
         if text.replace('"', '') in orig.replace('"', ''):
             collection.add(text)
 
+
+TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
+                        '.news_title', '.title', '.head', '.heading',
+                        '.contentheading', '.small_header_red']
+
+
 def shorten_title(doc):
     title = doc.find('.//title')
     if title is None or title.text is None or len(title.text) == 0:
@@ -70,7 +87,7 @@ def shorten_title(doc):
             if e.text_content():
                 add_match(candidates, e.text_content(), orig)
 
-    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
+    for item in TITLE_CSS_HEURISTICS:
         for e in doc.cssselect(item):
             if e.text:
                 add_match(candidates, e.text, orig)
@@ -102,13 +119,16 @@ def shorten_title(doc):
 
     return title
 
+
 def get_body(doc):
-    [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
+    for elem in doc.xpath('.//script | .//link | .//style'):
+        elem.drop_tree()
     raw_html = unicode(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
-    except Exception: #FIXME find the equivalent lxml error
-        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+    except Exception:  # FIXME find the equivalent lxml error
+        log.error("cleaning broke html content: "
+                  "%s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
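
End to end, build_doc now routes through lxml_fromstring and drops carriage returns before parsing. A sketch of the typical call chain on a made-up page (the comments show what the code above should yield; shorten_title is omitted since its output depends on the candidate heuristics):

from readability.htmls import build_doc, get_title, get_body

page = '<html><head><title>Example - My Site</title></head>' \
       '<body><p>Some text</p><script>var x;</script></body></html>'
doc, enc = build_doc(page)   # enc is the guessed encoding (typically 'utf-8' here); None for unicode input
print get_title(doc)         # 'Example - My Site'
print get_body(doc)          # body HTML with <script>/<link>/<style> dropped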
