#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO Text Processor
This module processes text files for intent generation.
"""
import os
import logging
import re
from collections import Counter


class TextProcessor:
    """Processes text files for intent generation."""

    def __init__(self):
        """Initialize the text processor."""
        # Progress (0-100) and status-message callbacks; no-op defaults that
        # callers may override.
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process(self, file_path):
        """
        Process a text file.

        Args:
            file_path (str): Path to the text file

        Returns:
            dict: Processed result with 'text', 'sentences' and 'key_phrases' keys
        """
        try:
            self.on_status(f"Processing text file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Read file
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
            self.on_progress(30)

            # Basic preprocessing
            self.on_status("Cleaning text...")

            # Remove extra whitespace
            text = re.sub(r'\s+', ' ', text)

            # Split into sentences
            self.on_status("Splitting into sentences...")
            sentences = self._split_into_sentences(text)
            self.on_progress(70)

            # Extract key phrases
            self.on_status("Extracting key phrases...")
            key_phrases = self._extract_key_phrases(sentences)
            self.on_progress(90)

            # Combine results
            result = {
                'text': text,
                'sentences': sentences,
                'key_phrases': key_phrases
            }

            self.on_progress(100)
            self.on_status("Text processing complete")
            return result
        except Exception as e:
            logging.error(f"Error processing text file: {str(e)}", exc_info=True)
            raise

    def _split_into_sentences(self, text):
        """
        Split text into sentences.

        Args:
            text (str): Text to split

        Returns:
            list: List of sentences
        """
        # Simple sentence splitting on terminal punctuation followed by whitespace
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Filter out empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]
        return sentences

    def _extract_key_phrases(self, sentences):
        """
        Extract key phrases from sentences.

        Args:
            sentences (list): List of sentences

        Returns:
            list: List of key phrases
        """
        # Try to use spaCy if available
        try:
            import spacy

            # Load spaCy model
            nlp = spacy.load("en_core_web_sm")

            key_phrases = []
            for sentence in sentences:
                doc = nlp(sentence)

                # Extract noun phrases
                for chunk in doc.noun_chunks:
                    if len(chunk.text.split()) > 1:  # Only multi-word phrases
                        key_phrases.append(chunk.text)

                # Extract verb phrases: a verb plus its direct/prepositional objects
                for token in doc:
                    if token.pos_ == "VERB":
                        phrase = token.text
                        for child in token.children:
                            if child.dep_ in ["dobj", "pobj"]:
                                phrase += " " + child.text
                        key_phrases.append(phrase)
            return key_phrases
        except (ImportError, OSError):
            # Fallback to simple approach if spaCy (or its model) is not available
            logging.warning("spaCy not available, using simple key phrase extraction")

            # Tokenize
            words = []
            for sentence in sentences:
                words.extend(sentence.lower().split())

            # Count word frequencies (not used by the bigram ranking below)
            word_counts = Counter(words)

            # Get common bigrams
            bigrams = []
            for i in range(len(words) - 1):
                bigrams.append(words[i] + " " + words[i + 1])
            bigram_counts = Counter(bigrams)

            # Return top phrases
            return [phrase for phrase, count in bigram_counts.most_common(20)]
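

# A minimal usage sketch (illustration only): "example.txt" is a placeholder
# path, and wiring the callbacks to print() is just one possible choice.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    processor = TextProcessor()
    processor.on_status = lambda s: print(f"[status] {s}")
    processor.on_progress = lambda p: print(f"[progress] {p}%")

    result = processor.process("example.txt")
    print(f"Sentences: {len(result['sentences'])}")
    print(f"Key phrases: {result['key_phrases'][:10]}")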