10000 run: Run.text includes no-break hyphen, etc. · python-openxml/python-docx@2e13d5c · GitHub
[go: up one dir, main page]

Skip to content

Commit 2e13d5c

Browse files
committed
run: Run.text includes no-break hyphen, etc.
Add additional run inner-content elements having a text equivalent, in particular `w:noBreakHyphen` and `w:ptab`. Give each of them their own custom element class having a `__str__()` method so they can each report their text content (constant in some cases like "-" for no-break hyphen).
1 parent ceb8cbe commit 2e13d5c

File tree

9 files changed

+150
-32
lines changed

9 files changed

+150
-32
lines changed

features/run-access-inner-content.feature

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ Feature: Access run inner-content including rendered page-breaks
2222
Then run.iter_inner_content() generates the run text and rendered page-breaks
2323

2424

25-
@wip
2625
Scenario: Run.text contains the text content of the run
2726
Given a run having mixed text content
2827
Then run.text contains the text content of the run

features/steps/paragraph.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,11 @@ def then_the_document_contains_four_paragraphs(context):
118118
def then_document_contains_text_I_added(context):
119119
document = Document(saved_docx_path)
120120
paragraphs = document.paragraphs
121-
p = paragraphs[-1]
122-
r = p.runs[0]
123-
assert r.text == test_text
121+
paragraph = paragraphs[-1]
122+
run = paragraph.runs[0]
123+
actual = run.text
124+
expected = test_text
125+
assert actual == expected, f"expected: {expected}, got: {actual}"
124126

125127

126128
@then("the paragraph alignment property value is {align_value}")
@@ -153,7 +155,9 @@ def then_the_paragraph_has_the_style_I_set(context):
153155

154156
@then("the paragraph has the text I set")
155157
def then_the_paragraph_has_the_text_I_set(context):
156-
assert context.paragraph.text == "bar\tfoo\n"
158+
actual = context.paragraph.text
159+
expected = "bar\tfoo\n"
160+
assert actual == expected, f"expected: {expected}, got: {actual}"
157161

158162

159163
@then("the style of the second paragraph matches the style I set")

src/docx/enum/base.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
11
"""Base classes and other objects used by enumerations."""
22

3+
from __future__ import annotations
4+
5+
import enum
36
import sys
47
import textwrap
8+
from typing import Callable, Type
9+
10+
from docx.exceptions import InvalidXmlError
511

6-
from ..exceptions import InvalidXmlError
712

13+
def alias(*aliases: str) -> Callable[..., Type[enum.Enum]]:
14+
"""Adds alternate name for an enumeration.
815
9-
def alias(*aliases):
10-
"""Decorating a class with @alias('FOO', 'BAR', ..) allows the class to be
11-
referenced by each of the names provided as arguments."""
16+
Decorating a class with @alias('FOO', 'BAR', ..) allows the class to be referenced
17+
by each of the names provided as arguments.
18+
"""
1219

1320
def decorator(cls):
1421
# alias must be set in globals from caller's frame

src/docx/enum/text.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
"""Enumerations related to text in WordprocessingML files."""
22

3-
from .base import EnumMember, XmlEnumeration, XmlMappedEnumMember, alias
3+
import enum
4+
from typing import ClassVar
5+
6+
from docx.enum.base import EnumMember, XmlEnumeration, XmlMappedEnumMember, alias
47

58

69
@alias("WD_ALIGN_PARAGRAPH")
@@ -59,6 +62,9 @@ class WD_PARAGRAPH_ALIGNMENT(XmlEnumeration):
5962
)
6063

6164

65+
WD_ALIGN_PARAGRAPH = WD_PARAGRAPH_ALIGNMENT
66+
67+
6268
class WD_BREAK_TYPE(object):
6369
"""Corresponds to WdBreakType enumeration http://msdn.microsoft.com/en-
6470
us/library/office/ff195905.aspx."""
@@ -184,6 +190,8 @@ class WD_TAB_ALIGNMENT(XmlEnumeration):
184190
class WD_TAB_LEADER(XmlEnumeration):
185191
"""Specifies the character to use as the leader with formatted tabs."""
186192

193+
SPACES: ClassVar[enum.Enum]
194+
187195
__ms_name__ = "WdTabLeader"
188196

189197
__url__ = "https://msdn.microsoft.com/en-us/library/office/ff845050.aspx"

src/docx/oxml/__init__.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,28 @@
66
from __future__ import annotations
77

88
from docx.oxml.parser import register_element_cls
9+
from docx.oxml.text.run import (
10+
CT_R,
11+
CT_Br,
12+
CT_Cr,
13+
CT_NoBreakHyphen,
14+
CT_PTab,
15+
CT_Text,
16+
)
17+
18+
# ---------------------------------------------------------------------------
19+
# text-related elements
20+
21+
register_element_cls("w:br", CT_Br)
22+
register_element_cls("w:cr", CT_Cr)
23+
register_element_cls("w:noBreakHyphen", CT_NoBreakHyphen)
24+
register_element_cls("w:ptab", CT_PTab)
25+
register_element_cls("w:r", CT_R)
26+
register_element_cls("w:t", CT_Text)
27+
28+
# ---------------------------------------------------------------------------
29+
# other custom element class mappings
930

10-
# ===========================================================================
11-
# custom element class mappings
12-
# ===========================================================================
1331
from .shared import CT_DecimalNumber, CT_OnOff, CT_String # noqa
1432

1533
register_element_cls("w:evenAndOddHeaders", CT_OnOff)
@@ -199,9 +217,3 @@
199217
register_element_cls("w:tab", CT_TabStop)
200218
register_element_cls("w:tabs", CT_TabStops)
201219
register_element_cls("w:widowControl", CT_OnOff)
202-
203-
from .text.run import CT_Br, CT_R, CT_Text # noqa
204-
205-
register_element_cls("w:br", CT_Br)
206-
register_element_cls("w:r", CT_R)
207-
register_element_cls("w:t", CT_Text)

src/docx/oxml/text/parfmt.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,12 +321,24 @@ class CT_Spacing(BaseOxmlElement):
321321

322322

323323
class CT_TabStop(BaseOxmlElement):
324-
"""`<w:tab>` element, representing an individual tab stop."""
324+
"""`<w:tab>` element, representing an individual tab stop.
325+
326+
Overloaded to use for a tab-character in a run, which also uses the w:tab tag but
327+
only needs a __str__ method.
328+
"""
325329

326330
val = RequiredAttribute("w:val", WD_TAB_ALIGNMENT)
327331
leader = OptionalAttribute("w:leader", WD_TAB_LEADER, default=WD_TAB_LEADER.SPACES)
328332
pos = RequiredAttribute("w:pos", ST_SignedTwipsMeasure)
329333

334+
def __str__(self) -> str:
335+
"""Text equivalent of a `w:tab` element appearing in a run.
336+
337+
Allows text of run inner-content to be accessed consistently across all text
338+
inner-content.
339+
"""
340+
return "\t"
341+
330342

331343
class CT_TabStops(BaseOxmlElement):
332344
"""``<w:tabs>`` element, container for a sorted sequence of tab stops."""

src/docx/oxml/text/run.py

Lines changed: 81 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,16 +76,10 @@ def text(self) -> str:
7676
Inner-content child elements like `w:tab` are translated to their text
7777
equivalent.
7878
"""
79-
text = ""
80-
for child in self:
81-
if child.tag == qn("w:t"):
82-
t_text = child.text
83-
text += t_text if t_text is not None else ""
84-
elif child.tag == qn("w:tab"):
85-
text += "\t"
86-
elif child.tag in (qn("w:br"), qn("w:cr")):
87-
text += "\n"
88-
return text
79+
return "".join(
80+
str(e)
81+
for e in self.xpath("w:br | w:cr | w:noBreakHyphen | w:ptab | w:t | w:tab")
82+
)
8983

9084
@text.setter
9185
def text(self, text: str):
@@ -104,13 +98,89 @@ def _insert_rPr(self, rPr: CT_RPr) -> CT_RPr:
10498
class CT_Br(BaseOxmlElement):
10599
"""`<w:br>` element, indicating a line, page, or column break in a run."""
106100

107-
type = OptionalAttribute("w:type", ST_BrType, default="textWrapping")
101+
type: str | None = OptionalAttribute( # pyright: ignore[reportGeneralTypeIssues]
102+
"w:type", ST_BrType, default="textWrapping"
103+
)
108104
clear = OptionalAttribute("w:clear", ST_BrClear)
109105

106+
def __str__(self) -> str:
107+
"""Text equivalent of this element. Actual value depends on break type.
108+
109+
A line break is translated as "\n". Column and page breaks produce the empty
110+
string ("").
111+
112+
This allows the text of run inner-content to be accessed in a consistent way
113+
for all run inner-content text elements.
114+
"""
115+
return "\n" if self.type == "textWrapping" else ""
116+
117+
118+
class CT_Cr(BaseOxmlElement):
119+
"""`<w:cr>` element, representing a carriage-return (0x0D) character within a run.
120+
121+
In Word, this represents a "soft carriage-return" in the sense that it does not end
122+
the paragraph the way pressing Enter (aka. Return) on the keyboard does. Here the
123+
text equivalent is considered to be newline ("\n") since in plain-text that's the
124+
closest Python equivalent.
125+
126+
NOTE: this complex-type name does not exist in the schema, where `w:cr` maps to
127+
`CT_Empty`. This name was added to give it distinguished behavior. CT_Empty is used
128+
for many elements.
129+
"""
130+
131+
def __str__(self) -> str:
132+
"""Text equivalent of this element, a single newline ("\n")."""
133+
return "\n"
134+
135+
136+
class CT_NoBreakHyphen(BaseOxmlElement):
137+
"""`<w:noBreakHyphen>` element, a hyphen ineligible for a line-wrap position.
138+
139+
This maps to a plain-text dash ("-").
140+
141+
NOTE: this complex-type name does not exist in the schema, where `w:noBreakHyphen`
142+
maps to `CT_Empty`. This name was added to give it behavior distinguished from the
143+
many other elements represented in the schema by CT_Empty.
144+
"""
145+
146+
def __str__(self) -> str:
147+
"""Text equivalent of this element, a single dash character ("-")."""
148+
return "-"
149+
150+
151+
class CT_PTab(BaseOxmlElement):
152+
"""`<w:ptab>` element, representing an absolute-position tab character within a run.
153+
154+
This character advances the rendering position to the specified position regardless
155+
of any tab-stops, perhaps for layout of a table-of-contents (TOC) or similar.
156+
"""
157+
158+
def __str__(self) -> str:
159+
"""Text equivalent of this element, a single tab ("\t") character.
160+
161+
This allows the text of run inner-content to be accessed in a consistent way
162+
for all run inner-content text elements.
163+
"""
164+
return "\t"
165+
166+
167+
# -- CT_Tab functionality is provided by CT_TabStop which also uses `w:tab` tag. That
168+
# -- element class provides the __str__() method for this empty element, unconditionally
169+
# -- returning "\t".
170+
110171

111172
class CT_Text(BaseOxmlElement):
112173
"""`<w:t>` element, containing a sequence of characters within a run."""
113174

175+
def __str__(self) -> str:
176+
"""Text contained in this element, the empty string if it has no content.
177+
178+
This method allows this run inner-content element to be queried for its text
179+
the same way as other run-content elements are. In particular, this never
180+
returns None, as etree._Element does when there is no content.
181+
"""
182+
return self.text or ""
183+
114184

115185
# ------------------------------------------------------------------------------------
116186
# Utility

tests/oxml/text/test_run.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,9 @@ def it_can_add_a_t_preserving_edge_whitespace(
3333
r.add_t(text)
3434

3535
assert r.xml == expected_xml
36+
37+
def it_can_assemble_the_text_in_the_run(self):
38+
cxml = 'w:r/(w:br,w:cr,w:noBreakHyphen,w:ptab,w:t"foobar",w:tab)'
39+
r = cast(CT_R, element(cxml))
40+
41+
assert r.text == "\n\n-\tfoobar\t"

tests/text/test_run.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def it_can_remove_its_content_but_keep_formatting(self, clear_fixture):
117117
("w:r", ""),
118118
('w:r/w:t"foobar"', "foobar"),
119119
('w:r/(w:t"abc", w:tab, w:t"def", w:cr)', "abc\tdef\n"),
120-
('w:r/(w:br{w:type=page}, w:t"abc", w:t"def", w:tab)', "\nabcdef\t"),
120+
('w:r/(w:br{w:type=page}, w:t"abc", w:t"def", w:tab)', "abcdef\t"),
121121
],
122122
)
123123
def it_knows_the_text_it_contains(self, r_cxml: str, expected_text: str):

0 commit comments

Comments
 (0)
0