@@ -76,13 +76,23 @@ def clean(text):
76
76
def text_length(i):
    """Return the length of node ``i``'s text after it is passed through ``clean``.

    ``i.text_content()`` is replaced by an empty string when it is falsy
    (for example ``None``), so such a node yields the length of ``clean("")``.
    """
    raw_text = i.text_content() or ""
    return len(clean(raw_text))
78
78
79
# Type of a compiled regular expression, obtained by compiling a throwaway
# pattern so already-compiled patterns can be recognised with isinstance().
regexp_type = type(re.compile('hello, world'))

# Text type accepted for comma-separated keyword strings: ``basestring`` where
# that builtin exists, plain ``str`` otherwise.
try:
    _string_types = basestring
except NameError:
    _string_types = str


def compile_pattern(elements):
    """Build a regular expression that matches any of the given keywords.

    :param elements: one of
        - a falsy value (``None``, ``""``, ``[]``): no pattern is built;
        - an already compiled pattern: returned unchanged;
        - a string of comma-separated keywords, e.g. ``"ads, sidebar"``;
        - an iterable of keyword strings, e.g. ``["ads", "sidebar"]``.
    :returns: a compiled pattern, or ``None`` when there is no usable keyword.

    Every keyword is lowercased and escaped, so it is matched literally.
    The resulting pattern is compiled with ``re.U`` only, i.e. it is
    case-sensitive against the lowercased keywords.
    """
    if not elements:
        return None
    if isinstance(elements, regexp_type):
        return elements
    if isinstance(elements, _string_types):
        # Whitespace around the commas ("ads, sidebar") is not part of a keyword.
        elements = [x.strip() for x in elements.split(',')]

    # Drop empty keywords: an empty alternative ("ads|") matches every string,
    # which would make the pattern fire on all elements.
    escaped = [re.escape(x.lower()) for x in elements if x]
    if not escaped:
        return None
    return re.compile(u'|'.join(escaped), re.U)
79
89
80
90
class Document :
81
91
"""Class to build a etree document out of html."""
82
92
TEXT_LENGTH_THRESHOLD = 25
83
93
RETRY_LENGTH = 250
84
94
85
- def __init__ (self , input , ** options ):
95
+ def __init__ (self , input , positive_keywords = None , negative_keywords = None , ** options ):
86
96
"""Generate the document
87
97
88
98
:param input: string of the html content.
@@ -93,19 +103,24 @@ def __init__(self, input, **options):
93
103
- min_text_length:
94
104
- retry_length:
95
105
- url: will allow adjusting links to be absolute
96
-
106
+ - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
107
+ - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
108
+ positive_keywords and negative_keywords may also be given as a compiled regexp.
97
109
"""
98
110
self .input = input
99
111
self .options = options
100
112
self .html = None
113
+ self .encoding = None
114
+ self .positive_keywords = compile_pattern (positive_keywords )
115
+ self .negative_keywords = compile_pattern (negative_keywords )
101
116
102
117
def _html(self, force=False):
    """Return the parsed document, parsing ``self.input`` only when needed.

    The result of ``self._parse(self.input)`` is stored in ``self.html`` and
    reused on later calls. Passing a truthy ``force`` re-parses even when a
    stored result exists.
    """
    already_parsed = self.html is not None
    if already_parsed and not force:
        return self.html
    self.html = self._parse(self.input)
    return self.html
106
121
107
122
def _parse (self , input ):
108
- doc = build_doc (input )
123
+ doc , self . encoding = build_doc (input )
109
124
doc = html_cleaner .clean_html (doc )
110
125
base_href = self .options .get ('url' , None )
111
126
if base_href :
@@ -311,19 +326,25 @@ def score_paragraphs(self, ):
311
326
312
327
def class_weight(self, e):
    """Return a score adjustment for element ``e`` from its class, id and tag.

    For each non-empty ``class`` / ``id`` attribute value, every matching
    pattern moves the score by 25: the built-in ``REGEXES['negativeRe']`` and
    ``self.negative_keywords`` subtract, ``REGEXES['positiveRe']`` and
    ``self.positive_keywords`` add. The keyword patterns are then also tried
    against the string ``'tag-' + e.tag``, anchored at its start.
    """
    user_positive = self.positive_keywords
    user_negative = self.negative_keywords
    total = 0

    for attribute in ('class', 'id'):
        value = e.get(attribute, None)
        if not value:
            continue
        if REGEXES['negativeRe'].search(value):
            total -= 25
        if REGEXES['positiveRe'].search(value):
            total += 25
        if user_positive and user_positive.search(value):
            total += 25
        if user_negative and user_negative.search(value):
            total -= 25

    # Keyword patterns can also target the element type through a
    # "tag-<name>" entry; match() only accepts a hit at the start.
    if user_positive and user_positive.match('tag-' + e.tag):
        total += 25
    if user_negative and user_negative.match('tag-' + e.tag):
        total -= 25

    return total
329
350
@@ -569,6 +590,8 @@ def main():
569
590
parser = OptionParser (usage = "%prog: [options] [file]" )
570
591
parser .add_option ('-v' , '--verbose' , action = 'store_true' )
571
592
parser .add_option ('-u' , '--url' , default = None , help = "use URL instead of a local file" )
593
+ parser .add_option ('-p' , '--positive-keywords' , default = None , help = "positive keywords (separated with comma)" , action = 'store' )
594
+ parser .add_option ('-n' , '--negative-keywords' , default = None , help = "negative keywords (separated with comma)" , action = 'store' )
572
595
(options , args ) = parser .parse_args ()
573
596
574
597
if not (len (args ) == 1 or options .url ):
@@ -581,11 +604,14 @@ def main():
581
604
file = urllib .urlopen (options .url )
582
605
else :
583
606
file = open (args [0 ], 'rt' )
584
- enc = sys .__stdout__ .encoding or 'utf-8'
607
+ enc = sys .__stdout__ .encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING
585
608
try :
586
609
print Document (file .read (),
587
610
debug = options .verbose ,
588
- url = options .url ).summary ().encode (enc , 'replace' )
611
+ url = options .url ,
612
+ positive_keywords = options .positive_keywords ,
613
+ negative_keywords = options .negative_keywords ,
614
+ ).summary ().encode (enc , 'replace' )
589
615
finally :
590
616
file .close ()
591
617
0 commit comments