Harry0201
diff --git a/README b/README
Lines changed: 8 additions & 0 deletions
diff --git a/readability/htmls.py b/readability/htmls.py
Lines changed: 14 additions & 6 deletions
diff --git a/readability/readability.py b/readability/readability.py
Lines changed: 41 additions & 15 deletions
diff --git a/setup.py b/setup.py
Lines changed: 8 additions & 2 deletions
@@ -35,17 +35,25 @@ Command-line usage::
     python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
 
 
+Example using positive/negative keywords::
+
+    python -m readability.readability -p intro -n newsindex,homepage-box,news-section -u http://python.org
+
+
 Document() kwarg options:
 
  - attributes:
  - debug: output debug messages
  - min_text_length:
  - retry_length:
  - url: will allow adjusting links to be absolute
+ - positive_keywords: a list of patterns to search for in element classes and ids; matching elements score higher, for example: ["news-item", "block"]
+ - negative_keywords: a list of patterns to search for in element classes and ids; matching elements score lower, for example: ["mysidebar", "related", "ads"]
 
 
 Updates
 
  - 0.2.5 Update setup.py for uploading .tar.gz to pypi
  - 0.2.6 Don't crash on documents with no title
  - 0.2.6.1 Document.short_title() properly works
+ - 0.3 Added Document.encoding, positive_keywords and negative_keywords
@@ -3,9 +3,7 @@
 from lxml.html import tostring
 import logging
 import lxml.html
-import re
-
-logging.getLogger().setLevel(logging.DEBUG)
+import re, sys
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
@@ -14,9 +12,19 @@ def build_doc(page):
         page_unicode = page
     else:
         enc = get_encoding(page)
-        page_unicode = page.decode(enc, 'replace')
+        if enc:
+            page_unicode = page.decode(enc, 'replace')
+            encoding = enc
+        else:
+            try:
+                #try utf-8
+                page_unicode = page.decode('utf-8', 'strict')
+                encoding = 'utf-8'
+            except UnicodeDecodeError:
+                page_unicode = page.decode('utf-8', 'replace')
+                encoding = 'utf-8'
     doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
-    return doc
+    return doc, encoding
 
 def js_re(src, pattern, flags, repl):
     return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))
@@ -111,5 +119,5 @@ def get_body(doc):
         #BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
     except Exception: #FIXME find the equivalent lxml error
-        logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
+        #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html
@@ -76,13 +76,23 @@ def clean(text):
 def text_length(i):
     return len(clean(i.text_content() or ""))
 
+regexp_type = type(re.compile('hello, world'))
+
+def compile_pattern(elements):
+    if not elements:
+        return None
+    if isinstance(elements, regexp_type):
+        return elements
+    if isinstance(elements, basestring):
+        elements = elements.split(',')
+    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
 
 class Document:
     """Class to build a etree document out of html."""
     TEXT_LENGTH_THRESHOLD = 25
     RETRY_LENGTH = 250
 
-    def __init__(self, input, **options):
+    def __init__(self, input, positive_keywords=None, negative_keywords=None, **options):
         """Generate the document
 
         :param input: string of the html content.
@@ -93,19 +103,24 @@ def __init__(self, input, **options):
             - min_text_length:
             - retry_length:
             - url: will allow adjusting links to be absolute
-
+            - positive_keywords: a list of patterns to search for in element classes and ids; matching elements score higher, for example: ["news-item", "block"]
+            - negative_keywords: a list of patterns to search for in element classes and ids; matching elements score lower, for example: ["mysidebar", "related", "ads"]
+            Both positive_keywords and negative_keywords may also be given as a comma-separated string or a compiled regexp.
         """
         self.input = input
         self.options = options
         self.html = None
+        self.encoding = None
+        self.positive_keywords = compile_pattern(positive_keywords)
+        self.negative_keywords = compile_pattern(negative_keywords)
 
     def _html(self, force=False):
         if force or self.html is None:
             self.html = self._parse(self.input)
         return self.html
 
     def _parse(self, input):
-        doc = build_doc(input)
+        doc, self.encoding = build_doc(input)
         doc = html_cleaner.clean_html(doc)
         base_href = self.options.get('url', None)
         if base_href:
@@ -311,19 +326,25 @@ def score_paragraphs(self, ):
 
     def class_weight(self, e):
         weight = 0
-        if e.get('class', None):
-            if REGEXES['negativeRe'].search(e.get('class')):
-                weight -= 25
+        for feature in [e.get('class', None), e.get('id', None)]:
+            if feature:
+                if REGEXES['negativeRe'].search(feature):
+                    weight -= 25
+
+                if REGEXES['positiveRe'].search(feature):
+                    weight += 25
+
+                if self.positive_keywords and self.positive_keywords.search(feature):
+                    weight += 25
 
-            if REGEXES['positiveRe'].search(e.get('class')):
-                weight += 25
+                if self.negative_keywords and self.negative_keywords.search(feature):
+                    weight -= 25
 
-        if e.get('id', None):
-            if REGEXES['negativeRe'].search(e.get('id')):
-                weight -= 25
+        if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag):
+            weight += 25
 
-            if REGEXES['positiveRe'].search(e.get('id')):
-                weight += 25
+        if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag):
+            weight -= 25
 
         return weight
 
@@ -569,6 +590,8 @@ def main():
     parser = OptionParser(usage="%prog: [options] [file]")
     parser.add_option('-v', '--verbose', action='store_true')
     parser.add_option('-u', '--url', default=None, help="use URL instead of a local file")
+    parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store')
+    parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store')
     (options, args) = parser.parse_args()
 
     if not (len(args) == 1 or options.url):
@@ -581,11 +604,14 @@ def main():
         file = urllib.urlopen(options.url)
     else:
         file = open(args[0], 'rt')
-    enc = sys.__stdout__.encoding or 'utf-8'
+    enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
     try:
         print Document(file.read(),
             debug=options.verbose,
-            url=options.url).summary().encode(enc, 'replace')
+            url=options.url,
+            positive_keywords = options.positive_keywords,
+            negative_keywords = options.negative_keywords,
+        ).summary().encode(enc, 'replace')
     finally:
         file.close()
 
 
@@ -1,9 +1,15 @@
 #!/usr/bin/env python
 from setuptools import setup, find_packages
+import sys
+
+if sys.platform == 'darwin':
+    lxml = "lxml<2.4"
+else:
+    lxml = "lxml"
 
 setup(
     name="readability-lxml",
-    version="0.2.6.1",
+    version="0.3",
     author="Yuri Baburov",
     author_email="burchik@gmail.com",
     description="fast python port of arc90's readability tool",
@@ -14,7 +20,7 @@
     packages=['readability'],
     install_requires=[
         "chardet",
-        "lxml"
+        lxml
         ],
     classifiers=[
         "Environment :: Web Environment",