|
4 | 4 | import sys
|
5 | 5 | import urlparse
|
6 | 6 |
|
7 |
| -from collections import defaultdict |
8 | 7 | from collections import namedtuple
|
9 | 8 | from lxml.etree import tostring
|
10 | 9 | from lxml.etree import tounicode
|
@@ -232,9 +231,11 @@ def same_domain(lhs, rhs):
|
232 | 231 | else:
|
233 | 232 | return split_lhs.netloc == split_rhs.netloc
|
234 | 233 |
|
| 234 | + |
235 | 235 | def strip_trailing_slash(s):
|
236 | 236 | return re.sub(r'/$', '', s)
|
237 | 237 |
|
| 238 | + |
238 | 239 | def eval_possible_next_page_link(
|
239 | 240 | parsed_urls,
|
240 | 241 | url,
|
@@ -336,6 +337,7 @@ def eval_possible_next_page_link(
|
336 | 337 | except ValueError as e:
|
337 | 338 | pass
|
338 | 339 |
|
| 340 | + |
339 | 341 | def find_next_page_link(parsed_urls, url, elem):
|
340 | 342 | links = tags(elem, 'a')
|
341 | 343 | base_url = find_base_url(url)
|
@@ -814,26 +816,6 @@ def sanitize(self, node, candidates):
|
814 | 816 | ' many <embed>s')
|
815 | 817 | to_remove = True
|
816 | 818 |
|
817 |
| - |
818 |
| -# if el.tag == 'div' and counts['img'] >= 1 and to_remove: |
819 |
| -# imgs = el.findall('.//img') |
820 |
| -# valid_img = False |
821 |
| -# self.debug(tounicode(el)) |
822 |
| -# for img in imgs: |
823 |
| -# |
824 |
| -# height = img.get('height') |
825 |
| -# text_length = img.get('text_length') |
826 |
| -# self.debug ("height %s text_length %s" %(repr(height), repr(text_length))) |
827 |
| -# if to_int(height) >= 100 or to_int(text_length) >= 100: |
828 |
| -# valid_img = True |
829 |
| -# self.debug("valid image" + tounicode(img)) |
830 |
| -# break |
831 |
| -# if valid_img: |
832 |
| -# to_remove = False |
833 |
| -# self.debug("Allowing %s" %el.text_content()) |
834 |
| -# for desnode in tags(el, "table", "ul", "div"): |
835 |
| -# allowed[desnode] = True |
836 |
| - |
837 | 819 | # don't really understand what this is doing. Originally
|
838 | 820 | # the i/j were =+ which sets the value to 1. I think that
|
839 | 821 | # was supposed to be += which would increment. But then
|
|
0 commit comments