#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO PDF Processor
This module processes PDF files for intent generation.
"""
import os
import logging
import re
from collections import Counter
class PDFProcessor:
"""Processes PDF files for intent generation."""
def __init__(self):
"""Initialize the PDF processor."""
self.on_progress = lambda p: None
self.on_status = lambda s: None
def process(self, file_path):
"""
Process a PDF file.
Args:
file_path (str): Path to the PDF file
Returns:
dict: Processed data
"""
try:
self.on_status(f"Processing PDF file: {os.path.basename(file_path)}")
self.on_progress(10)
# Try to import PyMuPDF
try:
import fitz # PyMuPDF
except ImportError:
raise ImportError("PyMuPDF (fitz) is required for PDF processing.
Please install it with 'pip install PyMuPDF'.")
self.on_progress(20)
# Open the PDF
self.on_status("Opening PDF...")
doc = fitz.open(file_path)
self.on_progress(30)
# Extract basic information
self.on_status("Extracting document information...")
info = {
'title': doc.metadata.get('title', ''),
'author': doc.metadata.get('author', ''),
'subject': doc.metadata.get('subject', ''),
'keywords': doc.metadata.get('keywords', ''),
'num_pages': len(doc),
'format': doc.metadata.get('format', '')
}
self.on_progress(40)
# Extract text
self.on_status("Extracting text...")
text = ""
for page_num in range(len(doc)):
self.on_status(f"Processing page {page_num + 1} of {len(doc)}...")
page = doc.load_page(page_num)
text += page.get_text()
self.on_progress(40 + int(50 * (page_num + 1) / len(doc)))
# Split into sentences
self.on_status("Splitting into sentences...")
sentences = self._split_into_sentences(text)
self.on_progress(90)
# Extract key phrases
self.on_status("Extracting key phrases...")
key_phrases = self._extract_key_phrases(sentences)
# Close the document
doc.close()
# Combine results
result = {
'info': info,
'text': text,
'sentences': sentences,
'key_phrases': key_phrases
}
self.on_progress(100)
self.on_status("PDF processing complete")
return result
except Exception as e:
logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
raise
def _split_into_sentences(self, text):
"""
Split text into sentences.
Args:
text (str): Text to split
Returns:
list: List of sentences
"""
# Clean text
text = re.sub(r'\s+', ' ', text)
# Simple sentence splitting
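        # The lookbehind keeps terminal punctuation attached to its sentence;
        # note this naive split also breaks on abbreviations like "Dr." or "e.g."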
sentences = re.split(r'(?<=[.!?])\s+', text)
# Filter out empty sentences
sentences = [s.strip() for s in sentences if s.strip()]
return sentences
def _extract_key_phrases(self, sentences):
"""
Extract key phrases from sentences.
Args:
sentences (list): List of sentences
Returns:
list: List of key phrases
"""
# Try to use spaCy if available
try:
import spacy
# Load spaCy model
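            # Note: the English model must be downloaded separately, e.g.
            # with "python -m spacy download en_core_web_sm"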
nlp = spacy.load("en_core_web_sm")
key_phrases = []
for sentence in sentences[:100]: # Limit to first 100 sentences
doc = nlp(sentence)
# Extract noun phrases
for chunk in doc.noun_chunks:
if len(chunk.text.split()) > 1: # Only multi-word phrases
key_phrases.append(chunk.text)
                # Extract simple verb phrases (a verb plus its object children)
                for token in doc:
                    if token.pos_ == "VERB":
                        phrase = token.text
                        for child in token.children:
                            if child.dep_ in ["dobj", "pobj"]:
                                phrase += " " + child.text
                        if " " in phrase:  # Only multi-word phrases, as above
                            key_phrases.append(phrase)
# Count frequencies
phrase_counts = Counter(key_phrases)
# Return top phrases
return [phrase for phrase, count in phrase_counts.most_common(30)]
        except (ImportError, OSError):
            # Fall back to a simple frequency-based approach if spaCy or its
            # English model is not available (spacy.load raises OSError when
            # the model is missing)
            logging.warning("spaCy not available, using simple key phrase extraction")
# Tokenize
words = []
for sentence in sentences[:100]: # Limit to first 100 sentences
words.extend(sentence.lower().split())
# Get common bigrams
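        # Note: bigrams are built over the flattened word list, so a pair may
        # span a sentence boundary; acceptable for a rough fallback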
bigrams = []
for i in range(len(words) - 1):
bigrams.append(words[i] + " " + words[i + 1])
bigram_counts = Counter(bigrams)
# Return top phrases
return [phrase for phrase, count in bigram_counts.most_common(30)]
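

# A minimal usage sketch, assuming PyMuPDF is installed; "sample.pdf" is a
# hypothetical path, and the lambdas simply print callback updates to stdout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    processor = PDFProcessor()
    processor.on_status = lambda s: print(f"[status] {s}")
    processor.on_progress = lambda p: print(f"[progress] {p}%")
    result = processor.process("sample.pdf")
    print(f"Pages: {result['info']['num_pages']}")
    print(f"Sentences extracted: {len(result['sentences'])}")
    print(f"Top key phrases: {result['key_phrases'][:5]}")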