Fix test for the multipage test with actual content · gui199/python-readability@cfc6f94 · GitHub
[go: up one dir, main page]

Skip to content

Commit cfc6f94

Browse files
committed
Fix test for the multipage test with actual content
1 parent 816c664 commit cfc6f94

File tree

3 files changed

+42
-67
lines changed

3 files changed

+42
-67
lines changed

src/readability_lxml/readability.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from collections import namedtuple
1010
from lxml.etree import tostring
1111
from lxml.etree import tounicode
12-
from lxml.html.diff import htmldiff
1312
from lxml.html import document_fromstring
1413
from lxml.html import fragment_fromstring
1514

@@ -448,7 +447,11 @@ def get_raw_article(candidates, best_candidate, enclose_with_html_tag=True):
448447
# We don't want to append directly to output, but the div
449448
# in html->body->div
450449
if enclose_with_html_tag:
451-
output.getchildren()[0].getchildren()[0].append(sibling)
450+
if sibling.tag == 'body':
451+
for elem in sibling.getchildren():
452+
output.getchildren()[0].getchildren()[0].append(elem)
453+
else:
454+
output.getchildren()[0].getchildren()[0].append(sibling)
452455
else:
453456
output.append(sibling)
454457

@@ -824,8 +827,20 @@ def append_next_page(parsed_urls, page_url, doc, options):
824827
# page_doc is a singular element containing the page article elements. We
825828
# want to add its children to the main article document to which we are
826829
# appending a page.
827-
for elem in page_doc:
828-
doc.append(elem)
830+
if doc.tag == 'html':
831+
children = doc.getchildren()
832+
if children[0].tag == 'head':
833+
import ipdb; ipdb.set_trace()
834+
for elem in page_doc:
835+
doc.getchildren()[1].append(elem)
836+
else:
837+
import ipdb; ipdb.set_trace()
838+
for elem in page_doc:
839+
doc.getchildren()[0].append(elem)
840+
else:
841+
import ipdb; ipdb.set_trace()
842+
for elem in page_doc:
843+
doc.append(elem)
829844
if next_page_url is not None:
830845
append_next_page(parsed_urls, next_page_url, doc, options)
831846

src/tests/test_readability.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import os
22
import unittest
33

4+
from lxml.html import document_fromstring
5+
from lxml.html.diff import htmldiff
6+
47
from helpers import load_regression_data
58
from helpers import REGRESSION_DATA
69
from readability_lxml.readability import Document
@@ -209,7 +212,24 @@ def test_basic(self):
209212
'urlfetch': fetcher
210213
}
211214
doc = Document(html, **options)
212-
res = doc.summary()
215+
res = doc.summary_with_metadata()
216+
217+
self.assertIn('Page 2', res.html, 'Should find the page 2 heading')
218+
self.assertIn('Page 3', res.html, 'Should find the page 3 heading')
219+
220+
expected_html = load_regression_data('basic-multi-page-expected.html')
221+
diff_html = htmldiff(expected_html, res.html)
222+
diff_doc = document_fromstring(diff_html)
223+
224+
insertions = diff_doc.xpath('//ins')
225+
deletions = diff_doc.xpath('//del')
226+
227+
if len(insertions) != 0:
228+
for i in insertions:
229+
print('unexpected insertion: %s' % i.xpath('string()'))
230+
self.fail('readability result does not match expected')
213231

214-
self.assertIn('Page 2', res, 'Should find the page 2 heading')
215-
self.assertIn('Page 3', res, 'Should find the page 3 heading')
232+
if len(deletions) != 0:
233+
for i in deletions:
234+
print('unexpected deletion: %s' % i.xpath('string()'))
235+
self.fail('readability result does not match expected')

test_data/basic-multi-page-3.html

Lines changed: 0 additions & 60 deletions
This file was deleted.

0 commit comments

Comments
 (0)
0