import pandas as pd
import re
import os
import google.generativeai as genai
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from chromadb import Client
# from dotenv import load_dotenv # No longer needed if using Colab secrets
from google.colab import userdata
# Load environment variables (using Colab secrets now)
# Read the Gemini API key from Colab's secret store.
API_KEY = userdata.get("GOOGLE_API_KEY")
if not API_KEY:
    # Fail fast with a clear message instead of a cryptic auth error later.
    raise ValueError("GOOGLE_API_KEY not found. Please set it in Colab secrets.")
genai.configure(api_key=API_KEY)
# Use the working model
model = genai.GenerativeModel('gemini-1.5-flash-latest')
# Initialize components (assuming these are already initialized from previous steps)
# Initialize the text splitter
# Small chunks (100 chars, 20 overlap) — sized for short email bodies.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Initialize an in-memory ChromaDB client and get the collection
# NOTE(review): Client() is in-memory — "email_chunks" is empty unless a
# previous cell populated it, in which case retrieval returns nothing. Confirm.
client = Client()
collection = client.get_or_create_collection(name="email_chunks")
# Define cleaning, extraction, and append functions (re-defining for clarity in this block)
def clean_email_content(email_content):
    """
    Strip HTML tags and unusual characters from a raw email body.

    Removes anything that looks like an HTML tag, drops characters outside
    letters/digits/whitespace/basic punctuation (.,!?;:), and collapses all
    whitespace runs into single spaces.

    Args:
        email_content: Raw (possibly HTML) email body text.

    Returns:
        The cleaned, single-spaced plain-text body.
    """
    without_tags = re.sub(r'<.*?>', '', email_content)
    allowed_only = re.sub(r'[^a-zA-Z0-9\s.,!?;:]', '', without_tags)
    return re.sub(r'\s+', ' ', allowed_only).strip()
def extract_data_for_excel(email_content):
    """
    Extract summary and contact-number fields from cleaned email content.

    Sender and Subject are placeholders ('N/A'); callers overwrite them with
    the known values from the email envelope (see process_new_email).

    Args:
        email_content: Cleaned plain-text email body.

    Returns:
        dict with keys 'Sender', 'Subject', 'Summary', 'Contact'.
    """
    # FIX: the original dict literal was missing its closing brace, and the
    # conditional expression below was broken across two lines (SyntaxError).
    extracted_data = {
        'Sender': 'N/A',
        'Subject': 'N/A',
        'Summary': 'N/A',
        'Contact': 'N/A',
    }
    # Truncate long bodies to the first 100 characters for the summary column.
    extracted_data['Summary'] = (
        email_content[:100] + '...' if len(email_content) > 100 else email_content
    )
    # Loose phone-number heuristic: any run of 7+ digits/spaces/dashes.
    contact_match = re.search(r'[\d\s-]{7,}', email_content)
    if contact_match:
        extracted_data['Contact'] = contact_match.group(0).strip()
    return extracted_data
def append_to_excel(data_dict, file_path):
    """
    Append one row of extracted email data to an Excel workbook.

    Creates the file (with a header row) if it does not exist; otherwise
    appends the row below the existing data without rewriting the header.

    Args:
        data_dict: Mapping of column name -> value for a single row.
        file_path: Destination .xlsx path. Appending requires the
            openpyxl engine.
    """
    # FIX: only create parent directories when the path actually has one.
    # os.makedirs('') raises FileNotFoundError, and the default path used by
    # process_new_email ('extracted_email_data.xlsx') is a bare filename.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df = pd.DataFrame([data_dict])
    if not os.path.exists(file_path):
        df.to_excel(file_path, index=False)
    else:
        # Overlay mode lets us write into the existing sheet; start below the
        # last populated row and skip the header so rows accumulate.
        with pd.ExcelWriter(file_path, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
            df.to_excel(writer, index=False, header=False, startrow=writer.book.active.max_row)
def search_email_chunks(query_string, n_results=3):
    """
    Retrieve the stored email chunks most similar to a query.

    Embeds the query with the module-level SentenceTransformer and runs a
    nearest-neighbour search against the "email_chunks" Chroma collection.

    Args:
        query_string: Free-text query.
        n_results: Maximum number of chunks to return (default 3).

    Returns:
        Chroma query-result dict containing 'documents', 'distances'
        and 'metadatas'.
    """
    query_vector = embedding_model.encode(query_string).tolist()
    return collection.query(
        query_embeddings=[query_vector],
        n_results=n_results,
        include=['documents', 'distances', 'metadatas'],
    )
def generate_reply(original_email_content, retrieved_chunks):
    """
    Generate an email reply with the Gemini model, grounded in retrieved context.

    Builds a prompt containing the original email plus each retrieved chunk,
    then asks the module-level `model` for a reply.

    Args:
        original_email_content: Cleaned body of the email being answered.
        retrieved_chunks: Iterable of context strings from the vector store.

    Returns:
        The generated reply text, or a fixed fallback message if generation
        raises for any reason.
    """
    prompt = f"""Original Email:
{original_email_content}
Relevant Context from past emails:
"""
    for i, chunk in enumerate(retrieved_chunks):
        prompt += f"Chunk {i+1}: {chunk}\n"
    # FIX: this literal was broken across two source lines (SyntaxError);
    # re-joined into a single instruction string.
    prompt += ("\nBased on the original email and the relevant context, "
               "generate a concise and helpful reply.")
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        # Best-effort: report the failure and return a safe fallback rather
        # than crashing the email-processing pipeline.
        print(f"Error generating content: {e}")
        return "Could not generate a reply at this time."
# Integrated function to process a new email
def process_new_email(new_email_content_raw, sender, subject,
                      excel_file_path='extracted_email_data.xlsx'):
    """
    Run the full RAG pipeline for one incoming email.

    Cleans the raw body, retrieves related chunks from the vector store,
    generates a reply with the LLM, and appends the extracted fields to an
    Excel workbook.

    Args:
        new_email_content_raw: Raw (possibly HTML) email body.
        sender: Display name / address of the sender.
        subject: Email subject line.
        excel_file_path: Workbook the extracted row is appended to.

    Returns:
        The generated reply text, or a fixed error message on any failure.
    """
    print(f"Processing new email from {sender} with subject: {subject}")
    try:
        # Step 1: normalise the raw body.
        cleaned = clean_email_content(new_email_content_raw)
        print("Email content cleaned.")
        # Step 2 (chunking) is intentionally skipped: the whole cleaned body
        # serves as the retrieval query and nothing is added to the store.
        # Step 3: vector search for related past-email chunks, flattening the
        # per-query nested lists Chroma returns.
        retrieval = search_email_chunks(cleaned)
        context_chunks = [
            doc
            for doc_list in retrieval.get('documents', [])
            for doc in doc_list
        ]
        print(f"Retrieved {len(context_chunks)} relevant chunks.")
        # Step 4: LLM reply grounded in the retrieved context.
        reply = generate_reply(cleaned, context_chunks)
        print("\nGenerated Reply:")
        print(reply)
        # Step 5: persist extracted fields for reporting.
        row = extract_data_for_excel(cleaned)
        row['Sender'] = sender  # Add sender from input
        row['Subject'] = subject  # Add subject from input
        append_to_excel(row, excel_file_path)
        print(f"\nExtracted data appended to {excel_file_path}.")
        return reply
    except Exception as e:
        print(f"\nAn error occurred during email processing: {e}")
        return "An error occurred during processing."
# 3. Create a sample "new" email string
sample_new_email_content = """
Hello team,
Just following up on the project update meeting. Could someone confirm the deadline for phase 2
implementation?
Also, please let me know if you need anything from my side. You can reach me at 555-1234.
Thanks,
Alice
"""
sample_sender = "Alice"
sample_subject = "Follow up on Project Update"
# 4. Call the integrated function with the sample new email
print("--- Starting New Email Processing ---")
process_new_email(sample_new_email_content, sample_sender, sample_subject)
print("--- New Email Processing Finished ---")
# 5. Optionally, read and display the updated Excel file
# Path matches process_new_email's default excel_file_path.
try:
    updated_extracted_df = pd.read_excel('extracted_email_data.xlsx')
    print("\nUpdated Excel File Content:")
    # NOTE(review): display() is an IPython/Colab builtin; this raises
    # NameError outside a notebook — confirm the intended runtime.
    display(updated_extracted_df)
except FileNotFoundError:
    print("\nExcel file not found after processing.")