Customizing Furniture and Body Recognition in Document Parsing #1338

Nilsonn · 2025-04-09T07:09:32Z

Nilsonn
Apr 9, 2025

Hey everyone,

I'm currently exploring the capabilities of docling for parsing PDF files and converting them into markdown. In my tests I've noticed that header images were not recognized as furniture, while the footer was successfully identified. I don't want to have the header image in the exported markdown file.

Are there options to:

- customize what is recognized as furniture or body within a document
- customize what part of the document is exported (e.g. with a bounding box or margins)
Or are there any other options to exclude the header images from the exported markdown?

Thank you!

justinterveystech · 2026-01-15T11:40:10Z

justinterveystech
Jan 15, 2026

from pathlib import Path
from docling_core.types.doc import ContentLayer, DocItemLabel
from collections import defaultdict
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
EasyOcrOptions,
AcceleratorOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

# Try this import first (most common in 2025–2026 releases)

# Import the pypdfium2-based PDF backend, trying the singular ``backend``
# package path before falling back to an alternative module path.
try:
    from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
except ImportError:
    # Fallback for some versions that use plural or different path.
    # NOTE(review): confirm that `docling.backends.pdf_backend` exists in a
    # released docling version; if it does not, this line simply raises
    # ImportError again.
    from docling.backends.pdf_backend import PyPdfiumDocumentBackend

# PDF handed to extract_all_text_robust() by main(). The path is relative, so
# it is resolved against the current working directory.
INPUT_PDF = Path("out.pdf")

def extract_all_text_robust(pdf_path: Path) -> str:
    """Convert a PDF with docling (full-page OCR + table structure) to Markdown.

    The PDF is converted, an inspection report is printed to stdout, and the
    Markdown is assembled by hand from every item in the BODY and FURNITURE
    content layers, in the order ``iterate_items`` yields them.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The assembled Markdown, with leading and trailing whitespace stripped.
    """
    print("Starting robust OCR + table extraction...\n")

    converter = _build_converter()

    print("Converting... (OCR can be slow on first run)\n")
    result = converter.convert(pdf_path)
    doc = result.document

    # Walk the document once and reuse the (item, level) pairs for the
    # statistics, the preview and the Markdown export.
    layers = {ContentLayer.BODY, ContentLayer.FURNITURE}
    items = list(doc.iterate_items(included_content_layers=layers))

    _print_inspection(doc, items)

    # Build full Markdown manually (includes furniture in order, table MD,
    # footnotes).
    print("\nBuilding full Markdown from all items...\n")
    return "".join(_item_to_markdown(item, doc) for item, _ in items).strip()


def _build_converter():
    """Return a DocumentConverter with OCR and table structure enabled for PDFs."""
    options = PdfPipelineOptions()
    options.do_ocr = True
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = True
    options.ocr_options = EasyOcrOptions(
        force_full_page_ocr=True,
        lang=["en"],
    )
    options.accelerator_options = AcceleratorOptions(
        num_threads=1,  # e.g. 8, 12, 16...
        device="auto",
    )

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=options,
                # Explicitly select the pypdfium2 backend for PDF input.
                backend=PyPdfiumDocumentBackend,
            )
        }
    )


def _print_inspection(doc, items) -> None:
    """Print overall counts, per-layer label counts and a 15-item preview.

    Args:
        doc: The converted document.
        items: List of ``(item, level)`` pairs taken from ``doc.iterate_items``.
    """
    print("\n=== Document Inspection ===\n")

    # 1. Basic overview
    print(f"Total text items: {len(doc.texts)}")
    print(f"Total tables:     {len(doc.tables)}")
    print(f"Total pictures:   {len(doc.pictures) if hasattr(doc, 'pictures') else 'N/A'}")
    print(f"Body root:        {doc.body is not None}")
    # NOTE(review): `furniture` may be deprecated in very new versions.
    print(f"Furniture root:   {doc.furniture is not None}\n")

    # 2. Count by content layer & label
    stats = defaultdict(lambda: defaultdict(int))
    for item, level in items:
        layer = item.content_layer.name if hasattr(item, "content_layer") else "UNKNOWN"
        label = item.label.name if hasattr(item, "label") else "NO_LABEL"
        stats[layer][label] += 1

        # Flag items whose first 60 characters contain one of the hard-coded
        # search terms (presumably specific to the sample PDF; adjust them
        # for other documents).
        text = getattr(item, "text", None)
        text_snippet = f"{text[:60]}..." if text else ""
        if "Aplio" in text_snippet or "a550" in text_snippet.lower():
            print("*** Possible title/header found ***")
            print(f"  Layer: {layer}")
            print(f"  Label: {label}")
            print(f"  Level: {level}")
            print(f"  Text : {text_snippet}")
            print(f"  Self-ref: {item.self_ref}\n")

    print("Element counts by layer & label:")
    for layer in sorted(stats):
        print(f"\n{layer}:")
        # Most frequent labels first.
        for lbl, cnt in sorted(stats[layer].items(), key=lambda x: x[1], reverse=True):
            print(f"  {lbl:18} : {cnt:3d}")

    # 3. Quick look at the first few items (including furniture)
    print("\nFirst 15 items in reading order (with layer):")
    for item, level in items[:15]:
        layer_name = item.content_layer.name if hasattr(item, "content_layer") else "?"
        label_name = item.label.name if hasattr(item, "label") else "?"
        text = getattr(item, "text", None)
        # A missing or None text is reported as "[no text]" instead of crashing.
        if text is None:
            text_prev = "[no text]"
        else:
            text_prev = text[:70].replace("\n", " ") + "..."
        print(f"{layer_name:10} | {label_name:18} | lvl {level:2} | {text_prev}")


def _item_to_markdown(item, doc) -> str:
    """Return the Markdown fragment for one item, or "" when it has no content.

    Tables are exported through their own Markdown exporter; page headers
    become level-1 headings; footnotes end with a single newline; every other
    item with non-blank text becomes a paragraph.
    """
    label = getattr(item, "label", None)

    if label == DocItemLabel.TABLE:
        # Pass the owning document: docling-core deprecates calling the
        # exporter without it.
        return item.export_to_markdown(doc=doc) + "\n\n"

    # Items without a text attribute, or with None/blank text, emit nothing.
    text = (getattr(item, "text", None) or "").strip()
    if not text:
        return ""

    if label == DocItemLabel.PAGE_HEADER:
        return f"# {text}\n\n"
    if label == DocItemLabel.FOOTNOTE:
        return f"{text}\n"
    # All remaining text-bearing items are emitted as plain paragraphs.
    return f"{text}\n\n"

def main() -> None:
    """Run the extraction on INPUT_PDF and print the Markdown with a summary."""
    print("=== Docling Full OCR Extraction ===\n")
    text = extract_all_text_robust(INPUT_PDF)

    # Frame the extracted Markdown between two 80-character rules.
    print("─" * 80)
    print(text)
    print("─" * 80)
    print(f"\nCharacters extracted: {len(text):,}")
    print("Done ✓")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Customizing Furniture and Body Recognition in Document Parsing #1338

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Customizing Furniture and Body Recognition in Document Parsing #1338

Uh oh!

Nilsonn Apr 9, 2025

Replies: 1 comment

Uh oh!

justinterveystech Jan 15, 2026

Try this import first (most common in 2025–2026 releases)

Nilsonn
Apr 9, 2025

justinterveystech
Jan 15, 2026