#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO PDF Processor
This module processes PDF files for intent generation.
"""
import os
import logging
import re
from collections import Counter
class PDFProcessor:
"""Processes PDF files for intent generation."""
def __init__(self):
"""Initialize the PDF processor."""
self.on_progress = lambda p: None
self.on_status = lambda s: None
def process(self, file_path):
"""
Process a PDF file.
Args:
file_path (str): Path to the PDF file
Returns:
dict: Processed data
"""
try:
self.on_status(f"Processing PDF file: {os.path.basename(file_path)}")
self.on_progress(10)
# Try to import PyMuPDF
try:
import fitz # PyMuPDF
except ImportError:
raise ImportError("PyMuPDF (fitz) is required for PDF processing.
Please install it with 'pip install PyMuPDF'.")
self.on_progress(20)
# Open the PDF
self.on_status("Opening PDF...")
doc = fitz.open(file_path)
self.on_progress(30)
# Extract basic information
self.on_status("Extracting document information...")
info = {
'title': doc.metadata.get('title', ''),
'author': doc.metadata.get('author', ''),
'subject': doc.metadata.get('subject', ''),
'keywords': doc.metadata.get('keywords', ''),
'num_pages': len(doc),
'format': doc.metadata.get('format', '')
}
self.on_progress(40)
# Extract text
self.on_status("Extracting text...")
text = ""
for page_num in range(len(doc)):
self.on_status(f"Processing page {page_num + 1} of {len(doc)}...")
page = doc.load_page(page_num)
text += page.get_text()
self.on_progress(40 + int(50 * (page_num + 1) / len(doc)))
# Split into sentences
self.on_status("Splitting into sentences...")
sentences = self._split_into_sentences(text)
self.on_progress(90)
# Extract key phrases
self.on_status("Extracting key phrases...")
key_phrases = self._extract_key_phrases(sentences)
# Close the document
doc.close()
# Combine results
result = {
'info': info,
'text': text,
'sentences': sentences,
'key_phrases': key_phrases
}
self.on_progress(100)
self.on_status("PDF processing complete")
return result
except Exception as e:
logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
raise
def _split_into_sentences(self, text):
"""
Split text into sentences.
Args:
text (str): Text to split
Returns:
list: List of sentences
"""
# Clean text
text = re.sub(r'\s+', ' ', text)
# Simple sentence splitting
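        # The lookbehind keeps terminal punctuation attached to its sentence;
        # note this naive split also breaks on abbreviations like "Dr." or "e.g."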
sentences = re.split(r'(?<=[.!?])\s+', text)
# Filter out empty sentences
sentences = [s.strip() for s in sentences if s.strip()]
return sentences
def _extract_key_phrases(self, sentences):
"""
Extract key phrases from sentences.
Args:
sentences (list): List of sentences
Returns:
list: List of key phrases
"""
# Try to use spaCy if available
try:
import spacy
# Load spaCy model
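            # Note: the English model must be downloaded separately, e.g.
            # with "python -m spacy download en_core_web_sm"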
nlp = spacy.load("en_core_web_sm")
key_phrases = []
for sentence in sentences[:100]: # Limit to first 100 sentences
doc = nlp(sentence)
# Extract noun phrases
for chunk in doc.noun_chunks:
if len(chunk.text.split()) > 1: # Only multi-word phrases
key_phrases.append(chunk.text)
                # Extract simple verb phrases (a verb plus its object children)
                for token in doc:
                    if token.pos_ == "VERB":
                        phrase = token.text
                        for child in token.children:
                            if child.dep_ in ["dobj", "pobj"]:
                                phrase += " " + child.text
                        if " " in phrase:  # Only multi-word phrases, as above
                            key_phrases.append(phrase)
# Count frequencies
phrase_counts = Counter(key_phrases)
# Return top phrases
return [phrase for phrase, count in phrase_counts.most_common(30)]
        except (ImportError, OSError):
            # Fall back to a simple frequency-based approach if spaCy or its
            # English model is not available (spacy.load raises OSError when
            # the model is missing)
            logging.warning("spaCy not available, using simple key phrase extraction")
# Tokenize
words = []
for sentence in sentences[:100]: # Limit to first 100 sentences
words.extend(sentence.lower().split())
# Get common bigrams
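        # Note: bigrams are built over the flattened word list, so a pair may
        # span a sentence boundary; acceptable for a rough fallback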
bigrams = []
for i in range(len(words) - 1):
bigrams.append(words[i] + " " + words[i + 1])
bigram_counts = Counter(bigrams)
# Return top phrases
return [phrase for phrase, count in bigram_counts.most_common(30)]
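

# A minimal usage sketch, assuming PyMuPDF is installed; "sample.pdf" is a
# hypothetical path, and the lambdas simply print callback updates to stdout.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    processor = PDFProcessor()
    processor.on_status = lambda s: print(f"[status] {s}")
    processor.on_progress = lambda p: print(f"[progress] {p}%")
    result = processor.process("sample.pdf")
    print(f"Pages: {result['info']['num_pages']}")
    print(f"Sentences extracted: {len(result['sentences'])}")
    print(f"Top key phrases: {result['key_phrases'][:5]}")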