run: add Run.iter_inner_content() · python-openxml/python-docx@08ee10a · GitHub
[go: up one dir, main page]

Skip to content

Commit 08ee10a

Browse files
committed
run: add Run.iter_inner_content()
1 parent 2364e90 commit 08ee10a

File tree

9 files changed

+226
-35
lines changed

9 files changed

+226
-35
lines changed

features/run-access-inner-content.feature

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ Feature: Access run inner-content including rendered page-breaks
1515
| two | True |
1616

1717

18-
@wip
1918
Scenario: Run.iter_inner_content() generates the run's text and rendered page-breaks
2019
Given a run having two rendered page breaks
2120
Then run.iter_inner_content() generates the run text and rendered page-breaks

src/docx/drawing/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
"""DrawingML-related objects are in this subpackage."""
2+
3+
from __future__ import annotations
4+
5+
from docx import types as t
6+
from docx.oxml.drawing import CT_Drawing
7+
from docx.shared import Parented
8+
9+
10+
class Drawing(Parented):
    """Proxy object wrapping a `CT_Drawing` element.

    In this version the object only carries references to its XML element and to
    its parent; it exposes no further behavior of its own.
    """

    def __init__(self, drawing: CT_Drawing, parent: t.StoryChild):
        super().__init__(parent)
        # -- parent is also stored directly on this instance --
        self._parent = parent
        # -- the same element is reachable under both names --
        self._drawing = drawing
        self._element = drawing

src/docx/oxml/__init__.py

Lines changed: 34 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,22 @@
55

66
from __future__ import annotations
77

8+
from docx.oxml.drawing import CT_Drawing
89
from docx.oxml.parser import register_element_cls
10+
from docx.oxml.shape import (
11+
CT_Blip,
12+
CT_BlipFillProperties,
13+
CT_GraphicalObject,
14+
CT_GraphicalObjectData,
15+
CT_Inline,
16+
CT_NonVisualDrawingProps,
17+
CT_Picture,
18+
CT_PictureNonVisual,
19+
CT_Point2D,
20+
CT_PositiveSize2D,
21+
CT_ShapeProperties,
22+
CT_Transform2D,
23+
)
924
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
1025
from docx.oxml.text.run import (
1126
CT_R,
@@ -16,6 +31,25 @@
1631
CT_Text,
1732
)
1833

34+
# ---------------------------------------------------------------------------
35+
# DrawingML-related elements
36+
37+
register_element_cls("a:blip", CT_Blip)
38+
register_element_cls("a:ext", CT_PositiveSize2D)
39+
register_element_cls("a:graphic", CT_GraphicalObject)
40+
register_element_cls("a:graphicData", CT_GraphicalObjectData)
41+
register_element_cls("a:off", CT_Point2D)
42+
register_element_cls("a:xfrm", CT_Transform2D)
43+
register_element_cls("pic:blipFill", CT_BlipFillProperties)
44+
register_element_cls("pic:cNvPr", CT_NonVisualDrawingProps)
45+
register_element_cls("pic:nvPicPr", CT_PictureNonVisual)
46+
register_element_cls("pic:pic", CT_Picture)
47+
register_element_cls("pic:spPr", CT_ShapeProperties)
48+
register_element_cls("w:drawing", CT_Drawing)
49+
register_element_cls("wp:docPr", CT_NonVisualDrawingProps)
50+
register_element_cls("wp:extent", CT_PositiveSize2D)
51+
register_element_cls("wp:inline", CT_Inline)
52+
1953
# ---------------------------------------------------------------------------
2054
# text-related elements
2155

@@ -78,36 +112,6 @@
78112

79113
register_element_cls("w:settings", CT_Settings)
80114

81-
from .shape import ( # noqa
82-
CT_Blip,
83-
CT_BlipFillProperties,
84-
CT_GraphicalObject,
85-
CT_GraphicalObjectData,
86-
CT_Inline,
87-
CT_NonVisualDrawingProps,
88-
CT_Picture,
89-
CT_PictureNonVisual,
90-
CT_Point2D,
91-
CT_PositiveSize2D,
92-
CT_ShapeProperties,
93-
CT_Transform2D,
94-
)
95-
96-
register_element_cls("a:blip", CT_Blip)
97-
register_element_cls("a:ext", CT_PositiveSize2D)
98-
register_element_cls("a:graphic", CT_GraphicalObject)
99-
register_element_cls("a:graphicData", CT_GraphicalObjectData)
100-
register_element_cls("a:off", CT_Point2D)
101-
register_element_cls("a:xfrm", CT_Transform2D)
102-
register_element_cls("pic:blipFill", CT_BlipFillProperties)
103-
register_element_cls("pic:cNvPr", CT_NonVisualDrawingProps)
104-
register_element_cls("pic:nvPicPr", CT_PictureNonVisual)
105-
register_element_cls("pic:pic", CT_Picture)
106-
register_element_cls("pic:spPr", CT_ShapeProperties)
107-
register_element_cls("wp:docPr", CT_NonVisualDrawingProps)
108-
register_element_cls("wp:extent", CT_PositiveSize2D)
109-
register_element_cls("wp:inline", CT_Inline)
110-
111115
from .styles import CT_LatentStyles, CT_LsdException, CT_Style, CT_Styles # noqa
112116

113117
register_element_cls("w:basedOn", CT_String)

src/docx/oxml/drawing.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""Custom element-classes for DrawingML-related elements like `<w:drawing>`.
2+
3+
For legacy reasons, many DrawingML-related elements are in `docx.oxml.shape`. Expect
4+
those to move over here as we have reason to touch them.
5+
"""
6+
7+
from docx.oxml.xmlchemy import BaseOxmlElement
8+
9+
10+
class CT_Drawing(BaseOxmlElement):
    """Custom element-class for the `<w:drawing>` element.

    A `<w:drawing>` element contains a DrawingML object such as a picture or chart.
    """

src/docx/oxml/text/run.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22

33
from __future__ import annotations
44

5-
from typing import TYPE_CHECKING, Callable, List
5+
from typing import TYPE_CHECKING, Callable, Iterator, List
66

7+
from docx.oxml.drawing import CT_Drawing
78
from docx.oxml.ns import qn
89
from docx.oxml.simpletypes import ST_BrClear, ST_BrType
910
from docx.oxml.text.font import CT_RPr
1011
from docx.oxml.xmlchemy import BaseOxmlElement, OptionalAttribute, ZeroOrMore, ZeroOrOne
12+
from docx.shared import TextAccumulator
1113

1214
if TYPE_CHECKING:
1315
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
@@ -52,6 +54,35 @@ def clear_content(self):
5254
for child in content_child_elms:
5355
self.remove(child)
5456

57+
@property
def inner_content_items(self) -> List[str | CT_Drawing | CT_LastRenderedPageBreak]:
    """Run content as a list of text, drawing, and rendered page-break items.

    Child elements are visited in the order the XPath query returns them. Each
    `w:drawing` and `w:lastRenderedPageBreak` element is emitted as-is. Every
    other selected element contributes its `str()` value, and each contiguous
    run of those values is joined into a single `str` item.
    """
    # -- imported here rather than at module level, as in the original code --
    from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak

    selector = " | ".join(
        (
            "w:br",
            "w:cr",
            "w:drawing",
            "w:lastRenderedPageBreak",
            "w:noBreakHyphen",
            "w:ptab",
            "w:t",
            "w:tab",
        )
    )

    items: List[str | CT_Drawing | CT_LastRenderedPageBreak] = []
    pending_text = TextAccumulator()

    for child in self.xpath(selector):
        if not isinstance(child, (CT_Drawing, CT_LastRenderedPageBreak)):
            pending_text.push(str(child))
            continue
        # -- flush any text gathered so far before emitting the element --
        items.extend(pending_text.pop())
        items.append(child)

    # -- text following the last element (or all text, if none) goes last --
    items.extend(pending_text.pop())
    return items
85+
5586
@property
5687
def lastRenderedPageBreaks(self) -> List[CT_LastRenderedPageBreak]:
5788
"""All `w:lastRenderedPageBreaks` descendants of this run."""

src/docx/shared.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from __future__ import annotations
44

55
import functools
6-
from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar, cast
6+
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterator, List, TypeVar, cast
77

88
if TYPE_CHECKING:
99
from docx.oxml.xmlchemy import BaseOxmlElement
@@ -315,3 +315,32 @@ def __init__(self, parent):
315315
def part(self):
316316
"""The package part containing this object."""
317317
return self._parent.part
318+
319+
320+
class TextAccumulator:
    """Accepts `str` fragments and joins them together, in order, on `.pop()`.

    Handy when text in a stream is broken up arbitrarily and you want to join it back
    together within certain bounds. The optional `separator` argument determines how
    the text fragments are punctuated, defaulting to the empty string.
    """

    def __init__(self, separator: str = ""):
        self._separator = separator
        self._texts: List[str] = []

    def push(self, text: str) -> None:
        """Add a text fragment to the accumulator."""
        self._texts.append(text)

    def pop(self) -> Iterator[str]:
        """Generate zero-or-one str from those accumulated.

        Using `yield from accum.pop()` in a generator setting avoids producing an empty
        string when no text is in the accumulator.

        Because this is a generator function, nothing happens until the returned
        iterator is consumed; the accumulated fragments are joined and cleared at that
        point, not when `.pop()` is called.
        """
        if not self._texts:
            return
        text = self._separator.join(self._texts)
        # -- reset before yielding so the accumulator is ready for the next range --
        self._texts.clear()
        yield text

src/docx/text/pagebreak.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Proxy objects related to rendered page-breaks."""
2+
3+
from __future__ import annotations
4+
5+
from docx import types as t
6+
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
7+
from docx.shared import Parented
8+
9+
10+
class RenderedPageBreak(Parented):
    """A page-break placed by Word while laying out pages for print or display.

    Such a break generally is not a "hard" page-break entered by the author; it only
    records that Word filled one page and had to begin the next. Where these fall can
    vary with the printer, page-size, margins and so on. Edits can move them too,
    although not until Word has loaded and saved the document again.

    `python-docx` never inserts these itself because it has no rendering function.
    They are mainly of use for text-extraction from existing documents, where
    `python-docx` acts purely as a document "reader".
    """

    def __init__(
        self, lastRenderedPageBreak: CT_LastRenderedPageBreak, parent: t.StoryChild
    ):
        super().__init__(parent)
        # -- the same element is reachable under both names --
        self._lastRenderedPageBreak = lastRenderedPageBreak
        self._element = lastRenderedPageBreak

src/docx/text/run.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,20 @@
22

33
from __future__ import annotations
44

5-
from typing import IO
5+
from typing import IO, Iterator
66

77
from docx import types as t
8+
from docx.drawing import Drawing
89
from docx.enum.style import WD_STYLE_TYPE
910
from docx.enum.text import WD_BREAK
11+
from docx.oxml.drawing import CT_Drawing
12+
from docx.oxml.text.pagebreak import CT_LastRenderedPageBreak
1013
from docx.oxml.text.run import CT_R, CT_Text
1114
from docx.shape import InlineShape
1215
from docx.shared import Length, Parented
1316
from docx.styles.style import CharacterStyle
1417
from docx.text.font import Font
18+
from docx.text.pagebreak import RenderedPageBreak
1519

1620

1721
class Run(Parented):
@@ -135,6 +139,31 @@ def italic(self) -> bool:
135139
def italic(self, value: bool):
136140
self.font.italic = value
137141

142+
def iter_inner_content(self) -> Iterator[str | Drawing | RenderedPageBreak]:
    """Generate this run's content-items in document order.

    Items come from `inner_content_items` on the underlying run element and are
    passed on as follows:

    - a `str` is generated unchanged; a contiguous range of elements having a
      plain-text equivalent (e.g. w:br, w:cr, w:noBreakHyphen, w:t, w:tab) arrives
      as one such `str`.
    - a rendered page-break element is generated as a `RenderedPageBreak`.
    - a drawing element is generated as a `Drawing`, which currently only gives
      access to its XML element (CT_Drawing) on its `._drawing` attribute.

    NOTE: only content-types currently supported by `python-docx` are generated;
    any other item is ignored. `Drawing` attributes and methods may be expanded in
    future releases.
    """
    for item in self._r.inner_content_items:
        if isinstance(item, str):
            yield item
            continue
        if isinstance(item, CT_LastRenderedPageBreak):
            yield RenderedPageBreak(item, self)
            continue
        if isinstance(  # pyright: ignore[reportUnnecessaryIsInstance]
            item, CT_Drawing
        ):
            yield Drawing(item, self)
166+
138167
@property
139168
def style(self) -> CharacterStyle | None:
140169
"""Read/write.

tests/text/test_run.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,16 @@
22

33
from __future__ import annotations
44

5-
from typing import cast
5+
from typing import List, cast
66

77
import pytest
88

9+
from docx import types as t
910
from docx.enum.style import WD_STYLE_TYPE
1011
from docx.enum.text import WD_BREAK, WD_UNDERLINE
1112
from docx.oxml.text.run import CT_R
1213
from docx.parts.document import DocumentPart
14+
from docx.parts.story import StoryPart
1315
from docx.shape import InlineShape
1416
from docx.text.font import Font
1517
from docx.text.run import Run
@@ -19,6 +21,8 @@
1921

2022

2123
class DescribeRun(object):
24+
"""Unit-test suite for `docx.text.run.Run`."""
25+
2226
def it_knows_its_bool_prop_states(self, bool_prop_get_fixture):
2327
run, prop_name, expected_state = bool_prop_get_fixture
2428
assert getattr(run, prop_name) == expected_state
@@ -45,6 +49,36 @@ def it_knows_whether_it_contains_a_page_break(
4549

4650
assert run.contains_page_break == expected_value
4751

52+
@pytest.mark.parametrize(
    ("r_cxml", "expected"),
    [
        # -- an empty run generates nothing --
        ("w:r", []),
        # -- adjacent text-like elements collapse into one str --
        ('w:r/(w:t"foo",w:cr,w:t"bar")', ["str"]),
        # -- a rendered page-break splits the text on either side of it --
        (
            'w:r/(w:t"abc",w:br,w:lastRenderedPageBreak,w:noBreakHyphen,w:t"def")',
            ["str", "RenderedPageBreak", "str"],
        ),
        # -- drawings are generated as their own item too --
        (
            'w:r/(w:t"abc", w:lastRenderedPageBreak, w:drawing)',
            ["str", "RenderedPageBreak", "Drawing"],
        ),
    ],
)
def it_can_iterate_its_inner_content_items(
    self, r_cxml: str, expected: List[str], fake_parent: t.StoryChild
):
    """Check the type-names of the items generated by `Run.iter_inner_content()`."""
    run = Run(cast(CT_R, element(r_cxml)), fake_parent)

    # -- compare by type-name so proxy objects need not be constructed here --
    actual = [type(item).__name__ for item in run.iter_inner_content()]

    assert actual == expected, f"expected: {expected}, got: {actual}"
81+
4882
def it_knows_its_character_style(self, style_get_fixture):
4983
run, style_id_, style_ = style_get_fixture
5084
style = run.style
@@ -244,6 +278,15 @@ def clear_fixture(self, request):
244278
expected_xml = xml(expected_cxml)
245279
return run, expected_xml
246280

281+
@pytest.fixture
def fake_parent(self) -> t.StoryChild:
    """A minimal stand-in parent object for constructing a `Run` in tests."""

    class StoryChild:
        """Stub exposing a `.part` property that raises when accessed."""

        @property
        def part(self) -> StoryPart:
            raise NotImplementedError

    stub_parent = StoryChild()
    return stub_parent
289+
247290
@pytest.fixture
248291
def font_fixture(self, Font_, font_):
249292
run = Run(element("w:r"), None)

0 commit comments

Comments
 (0)
0